diff --git a/CMakeLists.txt b/CMakeLists.txt index f9d94d9e27b0dea56a4cb9809201bcf3f72f4f78..06784b4e0515d9cdcbaeae31660a0b1faf682703 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -142,6 +142,7 @@ set(_qattn_sm90_SRC "sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu" "sage_attention/qattn/attn_cuda_sm90.h" "sage_attention/qattn/attn_utils.cuh" +"sage_attention/cuda_tensormap_shim.cuh" ) # TODO: check if CLion support this: diff --git a/build.toml b/build.toml index 60e0eefd1beefca4cdcb0e017a7747d6b2b9e0b2..5859c1b4ec49cb40088f939bb0d26b1b77f462de 100644 --- a/build.toml +++ b/build.toml @@ -1,21 +1,20 @@ [general] name = "sage_attention" universal = false +cuda-minver = "12.4" [torch] src = [ "torch-ext/torch_binding.cpp", "torch-ext/torch_binding.h", ] -cuda-capabilities = [ - "8.0", "9.0" -] [kernel._qattn] depends = ["torch"] backend = "cuda" +cuda-minver = "12.4" cuda-capabilities = [ - "9.0" + "8.0", "8.9", "9.0a" ] src = [ "sage_attention/cp_async.cuh", @@ -27,6 +26,7 @@ src = [ "sage_attention/reduction_utils.cuh", "sage_attention/wgmma.cuh", "sage_attention/utils.cuh", + "sage_attention/cuda_tensormap_shim.cuh", ] cxx-flags = ["-g", "-O3", "-fopenmp", "-lgomp", "-std=c++17", "-DENABLE_BF16"] cuda-flags = [ @@ -43,6 +43,7 @@ cuda-flags = [ [kernel._qattn_sm80] depends = ["torch"] backend = "cuda" +cuda-minver = "12.4" cuda-capabilities = [ "8.0" ] @@ -68,6 +69,7 @@ cuda-flags = [ [kernel._qattn_sm89] depends = ["torch"] backend = "cuda" +cuda-minver = "12.4" cuda-capabilities = [ "8.9", ] @@ -100,8 +102,9 @@ cuda-flags = [ [kernel._qattn_sm90] depends = ["torch"] backend = "cuda" +cuda-minver = "12.4" cuda-capabilities = [ - "9.0", + "9.0a", ] include = ["."] src = [ @@ -124,8 +127,9 @@ cuda-flags = [ [kernel._fused] depends = ["torch"] backend = "cuda" +cuda-minver = "12.4" cuda-capabilities = [ - "9.0", + "8.0", "8.9", "9.0a", ] include = ["."] src = [ diff --git a/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__init__.py b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5df69e92664edfbd0820d5126792e41e23a72762 --- /dev/null +++ b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__init__.py @@ -0,0 +1,12 @@ +from .quant import per_block_int8, per_warp_int8, sub_mean, per_channel_fp8 +from .core import sageattn, sageattn_qk_int8_pv_fp8_cuda + + +__all__ = [ + "per_block_int8", + "per_warp_int8", + "sub_mean", + "per_channel_fp8", + "sageattn", + "sageattn_qk_int8_pv_fp8_cuda", +] \ No newline at end of file diff --git a/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2dd95ad76c739a10e50fa3582787c6f2b2b7719d Binary files /dev/null and b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9588c3ce2f047661a943ab8cef11327e921545cb Binary files /dev/null and b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc 
b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..104e4122e8e0b2ab21b6ddc95c5b3f432d3a7736 Binary files /dev/null and b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..94f8d4797d4233c6a590b019bfe1950aca586f5c Binary files /dev/null and b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8047ade599c0570f9a869e5f1e4406f8ec35c444 Binary files /dev/null and b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/_ops.py b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..7dc56aab172e24e052a9f0c78060ecfbdff00309 --- /dev/null +++ b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _sage_attention_44b112f_dirty +ops = torch.ops._sage_attention_44b112f_dirty + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_sage_attention_44b112f_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/_sage_attention_44b112f_dirty.abi3.so b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/_sage_attention_44b112f_dirty.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..8d67979089a7c21cfb93c3a6232245d4ed307168 --- /dev/null +++ b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/_sage_attention_44b112f_dirty.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b577da1986b76b2571e8dd55412621e6fc85fe1a2f847bc0a5af9851bf388cf2 +size 26037568 diff --git a/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/core.py b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/core.py new file mode 100644 index 0000000000000000000000000000000000000000..dc44a8e1ee17a5c5c65da5adda6faf9228cca55e --- /dev/null +++ b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/core.py @@ -0,0 +1,983 @@ +""" +Copyright (c) 2024 by SageAttention team. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import torch +import torch.nn.functional as F + +from ._ops import ops + + +from .quant import per_warp_int8 as per_warp_int8_cuda +from .quant import sub_mean +from .quant import per_channel_fp8 +from .quant_per_thread import per_thread_int8 as per_thread_int8_triton + +from typing import Any, List, Literal, Optional, Tuple, Union +import warnings + + +import subprocess +import re + + +def get_cuda_version(): + try: + output = subprocess.check_output(["nvcc", "--version"]).decode() + match = re.search(r"release (\d+)\.(\d+)", output) + if match: + major, minor = int(match.group(1)), int(match.group(2)) + return major, minor + except Exception as e: + print("Failed to get CUDA version:", e) + return None, None + + +def get_cuda_arch_versions(): + cuda_archs = [] + for i in range(torch.cuda.device_count()): + major, minor = torch.cuda.get_device_capability(i) + cuda_archs.append(f"sm{major}{minor}") + return cuda_archs + + +def sageattn( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + tensor_layout: str = "HND", + is_causal: bool = False, + sm_scale: Optional[float] = None, + return_lse: bool = False, + **kwargs: Any, +): + """ + Automatically selects the appropriate implementation of the SageAttention kernel based on the GPU compute capability. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + v : torch.Tensor + The value tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + is_causal : bool + Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len. + Default: False. + + sm_scale : Optional[float] + The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``. + + return_lse : bool + Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention. + Default: False. + + Returns + ------- + torch.Tensor + The output tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + torch.Tensor + The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor). + Shape: ``[batch_size, num_qo_heads, qo_len]``. + Only returned if `return_lse` is True. + + Note + ---- + - ``num_qo_heads`` must be divisible by ``num_kv_heads``. + - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - All tensors must be on the same cuda device. 
+ """ + + arch = get_cuda_arch_versions()[q.device.index] + if arch == "sm80": + return sageattn_qk_int8_pv_fp16_cuda( + q, + k, + v, + tensor_layout=tensor_layout, + is_causal=is_causal, + sm_scale=sm_scale, + return_lse=return_lse, + pv_accum_dtype="fp32", + ) + elif arch == "sm89": + return sageattn_qk_int8_pv_fp8_cuda( + q, + k, + v, + tensor_layout=tensor_layout, + is_causal=is_causal, + sm_scale=sm_scale, + return_lse=return_lse, + pv_accum_dtype="fp32+fp16", + ) + elif arch == "sm90": + return sageattn_qk_int8_pv_fp8_cuda_sm90( + q, + k, + v, + tensor_layout=tensor_layout, + is_causal=is_causal, + sm_scale=sm_scale, + return_lse=return_lse, + pv_accum_dtype="fp32+fp32", + ) + elif arch == "sm120": + return sageattn_qk_int8_pv_fp8_cuda( + q, + k, + v, + tensor_layout=tensor_layout, + is_causal=is_causal, + qk_quant_gran="per_warp", + sm_scale=sm_scale, + return_lse=return_lse, + pv_accum_dtype="fp32+fp16", + ) # sm120 has accurate fp32 accumulator for fp8 mma and triton kernel is currently not usable on sm120. + else: + raise ValueError(f"Unsupported CUDA architecture: {arch}") + + +@torch.compiler.disable +def sageattn_qk_int8_pv_fp16_cuda( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + tensor_layout: str = "HND", + is_causal: bool = False, + qk_quant_gran: str = "per_thread", + sm_scale: Optional[float] = None, + pv_accum_dtype: str = "fp32", + smooth_k: bool = True, + smooth_v: bool = False, + return_lse: bool = False, + **kwargs: Any, +) -> torch.Tensor: + """ + SageAttention with INT8 quantization for Q and K, FP16 PV with FP16/FP32 accumulation, implemented using CUDA. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + v : torch.Tensor + The value tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + is_causal : bool + Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len. + Default: False. + + qk_quant_gran : str + The granularity of quantization for Q and K, either "per_warp" or "per_thread". + Default: "per_thread". + + sm_scale : Optional[float] + The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``. + + pv_accum_dtype : str + The dtype of the accumulation of the product of the value tensor and the attention weights, either "fp16", "fp16+fp32" or "fp32". + - "fp16": PV accumulation is done in fully in FP16. This is the fastest option but may lead to numerical instability. `smooth_v` option will increase the accuracy in cases when the value tensor has a large bias (like in CogVideoX-2b). + - "fp32": PV accumulation is done in FP32. This is the most accurate option but may be slower than "fp16" due to CUDA core overhead. + - "fp16+fp32": PV accumulation is done in FP16, but added to a FP32 buffer every few iterations. This offers a balance between speed and accuracy. + Default: "fp32". 
+ + smooth_k : bool + Whether to smooth the key tensor by subtracting the mean along the sequence dimension. + Default: True. + + smooth_v : bool + Whether to smooth the value tensor by subtracting the mean along the sequence dimension. + smooth_v will be ignored if pv_accum_dtype is "fp32" or "fp16+fp32". + Default: False. + + return_lse : bool + Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention. + Default: False. + + Returns + ------- + torch.Tensor + The output tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + torch.Tensor + The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor). + Shape: ``[batch_size, num_qo_heads, qo_len]``. + Only returned if `return_lse` is True. + + Note + ---- + - ``num_qo_heads`` must be divisible by ``num_kv_heads``. + - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - All tensors must be on the same cuda device. + - `smooth_k` will introduce slight overhead but will improve the accuracy under most circumstances. + """ + + dtype = q.dtype + assert q.is_cuda, "Input tensors must be on cuda." + assert dtype in [torch.float16, torch.bfloat16], ( + "Input tensors must be in dtype of torch.float16 or torch.bfloat16" + ) + assert qk_quant_gran in ["per_warp", "per_thread"], ( + "qk_quant_gran must be either 'per_warp' or 'per_thread'." + ) + assert q.device == k.device == v.device, "All tensors must be on the same device." + assert q.dtype == k.dtype == v.dtype, "All tensors must have the same dtype." + + # FIXME(DefTruth): make sage attention work compatible with distributed + # env, for example, xDiT which launch by torchrun. Without this workaround, + # sage attention will run into illegal memory access error after first + # inference step in distributed env for multi gpus inference. This small + # workaround also make sage attention work compatible with torch.compile + # through non-fullgraph compile mode. + torch.cuda.set_device(v.device) + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + _is_caual = 1 if is_causal else 0 + _qk_quant_gran = 3 if qk_quant_gran == "per_thread" else 2 + _return_lse = 1 if return_lse else 0 + + head_dim_og = q.size(-1) + + if head_dim_og < 64: + q = torch.nn.functional.pad(q, (0, 64 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 64 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 64 - head_dim_og)) + elif head_dim_og > 64 and head_dim_og < 128: + q = torch.nn.functional.pad(q, (0, 128 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 128 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 128 - head_dim_og)) + elif head_dim_og > 128: + raise ValueError(f"Unsupported head_dim: {head_dim_og}") + + # assert last dim is contiguous + assert q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1, ( + "Last dim of qkv must be contiguous." 
+ ) + + if sm_scale is None: + sm_scale = head_dim_og**-0.5 + + seq_dim = 1 if _tensor_layout == 0 else 2 + nh_dim = 2 if _tensor_layout == 0 else 1 + + if smooth_k: + km = k.mean(dim=seq_dim, keepdim=True) + nqheads = q.size(2) + nkheads = k.size(2) + q_per_kv_heads = nqheads // nkheads + if q_per_kv_heads > 1: + # nheads_k => nheads_q + km_broadcast = torch.repeat_interleave(km, q_per_kv_heads, dim=nh_dim) + else: + km_broadcast = km + if return_lse: + if tensor_layout == "NHD": + lse_correction = ( + torch.matmul( + q.transpose(1, 2), km_broadcast.transpose(1, 2).transpose(2, 3) + ) + .squeeze(-1) + .to(torch.float32) + ) + else: + lse_correction = ( + torch.matmul(q, km_broadcast.transpose(2, 3)) + .squeeze(-1) + .to(torch.float32) + ) + else: + km = None + + if qk_quant_gran == "per_warp": + q_int8, q_scale, k_int8, k_scale = per_warp_int8_cuda( + q, + k, + km, + tensor_layout=tensor_layout, + BLKQ=128, + WARPQ=(16 if (q.size(-1) == 128 and pv_accum_dtype == "fp16+fp32") else 32), + BLKK=64, + ) + elif qk_quant_gran == "per_thread": + q_int8, q_scale, k_int8, k_scale = per_thread_int8_triton( + q, + k, + km, + tensor_layout=tensor_layout, + BLKQ=128, + WARPQ=(16 if (q.size(-1) == 128 and pv_accum_dtype == "fp16+fp32") else 32), + BLKK=64, + WARPK=64, + ) + + o = torch.empty(q.size(), dtype=dtype, device=q.device) + + if pv_accum_dtype in ["fp32", "fp16+fp32"] and smooth_v: + warnings.warn(f"pv_accum_dtype is {pv_accum_dtype}, smooth_v will be ignored.") + smooth_v = False + + if pv_accum_dtype == "fp32": + v = v.to(torch.float16) + lse = _qattn_sm80.qk_int8_sv_f16_accum_f32_attn( + q_int8, + k_int8, + v, + o, + q_scale, + k_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + elif pv_accum_dtype == "fp16": + if smooth_v: + smoothed_v, vm = sub_mean(v, tensor_layout=tensor_layout) + lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_fuse_v_mean_attn( + q_int8, + k_int8, + smoothed_v, + o, + q_scale, + k_scale, + vm, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + else: + v = v.to(torch.float16) + lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_attn( + q_int8, + k_int8, + v, + o, + q_scale, + k_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + elif pv_accum_dtype == "fp16+fp32": + v = v.to(torch.float16) + lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_attn_inst_buf( + q_int8, + k_int8, + v, + o, + q_scale, + k_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + else: + raise ValueError(f"Unsupported pv_accum_dtype: {pv_accum_dtype}") + + o = o[..., :head_dim_og] + + if return_lse: + return ( + o, + lse / 1.44269504 + lse_correction * sm_scale + if smooth_k + else lse / 1.44269504, + ) + else: + return o + + +@torch.compiler.disable +def sageattn_qk_int8_pv_fp8_cuda( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + tensor_layout: str = "HND", + is_causal: bool = False, + qk_quant_gran: str = "per_thread", + sm_scale: Optional[float] = None, + pv_accum_dtype: str = "fp32+fp16", + smooth_k: bool = True, + smooth_v: bool = False, + return_lse: bool = False, + **kwargs: Any, +) -> torch.Tensor: + """ + SageAttention with INT8 quantization for Q and K, FP8 PV with FP32 accumulation, implemented using CUDA. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. 
+ + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + v : torch.Tensor + The value tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + is_causal : bool + Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len. + Default: False. + + qk_quant_gran : str + The granularity of quantization for Q and K, either "per_warp" or "per_thread". + Default: "per_thread". + + sm_scale : Optional[float] + The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``. + + pv_accum_dtype : str + The dtype of the accumulation of the product of the value tensor and the attention weights, either "fp32" or "fp32+fp32". + - "fp32": PV accumulation is done in fully in FP32. However, due to the hardware issue, there are only 22 valid bits in the FP32 accumulator. + - "fp32+fp32": PV accumulation is done in FP32 (actually FP22), but added to a FP32 buffer every few iterations. This offers a balance between speed and accuracy. + Default: "fp32+fp32". + + smooth_k : bool + Whether to smooth the key tensor by subtracting the mean along the sequence dimension. + Default: True. + + smooth_v : bool + Whether to smooth the value tensor by subtracting the mean along the sequence dimension. + smooth_v will be ignored if pv_accum_dtype is "fp32+fp32". + Default: False. + + return_lse : bool + Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention. + Default: False. + + Returns + ------- + torch.Tensor + The output tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + torch.Tensor + The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor). + Shape: ``[batch_size, num_qo_heads, qo_len]``. + Only returned if `return_lse` is True. + + Note + ---- + - ``num_qo_heads`` must be divisible by ``num_kv_heads``. + - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - All tensors must be on the same cuda device. + - `smooth_k` will introduce slight overhead but will improve the accuracy under most circumstances. + """ + + dtype = q.dtype + assert q.is_cuda, "Input tensors must be on cuda." + assert dtype in [torch.float16, torch.bfloat16], ( + "Input tensors must be in dtype of torch.float16 or torch.bfloat16" + ) + assert qk_quant_gran in ["per_warp", "per_thread"], ( + "qk_quant_gran must be either 'per_warp' or 'per_thread'." + ) + assert q.device == k.device == v.device, "All tensors must be on the same device." + assert q.dtype == k.dtype == v.dtype, "All tensors must have the same dtype." + + # cuda_major_version, cuda_minor_version = get_cuda_version() + # if(cuda_major_version, cuda_minor_version) < (12, 8) and pv_accum_dtype == 'fp32+fp16': + # warnings.warn("cuda version < 12.8, change pv_accum_dtype to 'fp32+fp32'") + # pv_accum_dtype = 'fp32+fp32' + + # FIXME(DefTruth): make sage attention work compatible with distributed + # env, for example, xDiT which launch by torchrun. 
Without this workaround, + # sage attention will run into illegal memory access error after first + # inference step in distributed env for multi gpus inference. This small + # workaround also make sage attention work compatible with torch.compile + # through non-fullgraph compile mode. + torch.cuda.set_device(v.device) + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + _is_caual = 1 if is_causal else 0 + _qk_quant_gran = 3 if qk_quant_gran == "per_thread" else 2 + _return_lse = 1 if return_lse else 0 + + head_dim_og = q.size(-1) + + if head_dim_og < 64: + q = torch.nn.functional.pad(q, (0, 64 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 64 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 64 - head_dim_og)) + elif head_dim_og > 64 and head_dim_og < 128: + q = torch.nn.functional.pad(q, (0, 128 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 128 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 128 - head_dim_og)) + elif head_dim_og > 128: + raise ValueError(f"Unsupported head_dim: {head_dim_og}") + + # assert last dim is contiguous + assert q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1, ( + "Last dim of qkv must be contiguous." + ) + + if sm_scale is None: + sm_scale = head_dim_og**-0.5 + + seq_dim = 1 if _tensor_layout == 0 else 2 + nh_dim = 2 if _tensor_layout == 0 else 1 + + if smooth_k: + km = k.mean(dim=seq_dim, keepdim=True) + nqheads = q.size(2) + nkheads = k.size(2) + q_per_kv_heads = nqheads // nkheads + if q_per_kv_heads > 1: + # nheads_k => nheads_q + km_broadcast = torch.repeat_interleave(km, q_per_kv_heads, dim=nh_dim) + else: + km_broadcast = km + if return_lse: + if tensor_layout == "NHD": + lse_correction = ( + torch.matmul( + q.transpose(1, 2), km_broadcast.transpose(1, 2).transpose(2, 3) + ) + .squeeze(-1) + .to(torch.float32) + ) + else: + lse_correction = ( + torch.matmul(q, km_broadcast.transpose(2, 3)) + .squeeze(-1) + .to(torch.float32) + ) + else: + km = None + + if qk_quant_gran == "per_warp": + q_int8, q_scale, k_int8, k_scale = per_warp_int8_cuda( + q, k, km, tensor_layout=tensor_layout, BLKQ=128, WARPQ=32, BLKK=64 + ) + elif qk_quant_gran == "per_thread": + q_int8, q_scale, k_int8, k_scale = per_thread_int8_triton( + q, k, km, tensor_layout=tensor_layout, BLKQ=128, WARPQ=32, BLKK=64, WARPK=64 + ) + + o = torch.empty(q.size(), dtype=dtype, device=q.device) + + if pv_accum_dtype == "fp32+fp32" and smooth_v: + warnings.warn("pv_accum_dtype is 'fp32+fp32', smooth_v will be ignored.") + smooth_v = False + + if pv_accum_dtype == "fp32+fp16" and smooth_v: + warnings.warn("pv_accum_dtype is 'fp32+fp16', smooth_v will be ignored.") + smooth_v = False + + quant_v_scale_max = 448.0 + if pv_accum_dtype == "fp32+fp16": + quant_v_scale_max = 2.25 + + v_fp8, v_scale, vm = per_channel_fp8( + v, tensor_layout=tensor_layout, scale_max=quant_v_scale_max, smooth_v=smooth_v + ) + print("before kernel call") + if pv_accum_dtype == "fp32": + if smooth_v: + lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_fuse_v_mean_attn( + q_int8, + k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + vm, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + torch.cuda.synchronize() + else: + lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn( + q_int8, + k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + torch.cuda.synchronize() + elif pv_accum_dtype == "fp32+fp32": + lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf( + q_int8, + 
k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + torch.cuda.synchronize() + elif pv_accum_dtype == "fp32+fp16": + lse = ops.qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf( + q_int8, + k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + torch.cuda.synchronize() + o = o[..., :head_dim_og] + print("after kernel call") + if return_lse: + return ( + o, + lse / 1.44269504 + lse_correction * sm_scale + if smooth_k + else lse / 1.44269504, + ) + else: + return o + + +@torch.compiler.disable +def sageattn_qk_int8_pv_fp8_cuda_sm90( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + tensor_layout: str = "HND", + is_causal: bool = False, + qk_quant_gran: str = "per_thread", + sm_scale: Optional[float] = None, + pv_accum_dtype: str = "fp32+fp32", + smooth_k: bool = True, + return_lse: bool = False, + **kwargs: Any, +) -> torch.Tensor: + """ + SageAttention with INT8 quantization for Q and K, FP8 PV with FP32 accumulation, implemented using CUDA. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + v : torch.Tensor + The value tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + is_causal : bool + Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len. + Default: False. + + qk_quant_gran : str + The granularity of quantization for Q and K, either "per_warp" or "per_thread". + Default: "per_thread". + + sm_scale : Optional[float] + The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``. + + pv_accum_dtype : str + The dtype of the accumulation of the product of the value tensor and the attention weights, either "fp32" or "fp32+fp32". + - "fp32": PV accumulation is done in fully in FP32. However, due to the hardware issue, there are only 22 valid bits in the FP32 accumulator. + - "fp32+fp32": PV accumulation is done in FP32 (actually FP22), but added to a FP32 buffer every few iterations. This offers a balance between speed and accuracy. + Default: "fp32+fp32". + + smooth_k : bool + Whether to smooth the key tensor by subtracting the mean along the sequence dimension. + Default: True. + + return_lse : bool + Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention. + Default: False. + + Returns + ------- + torch.Tensor + The output tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + torch.Tensor + The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor). + Shape: ``[batch_size, num_qo_heads, qo_len]``. + Only returned if `return_lse` is True. + + Note + ---- + - ``num_qo_heads`` must be divisible by ``num_kv_heads``. 
+ - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - All tensors must be on the same cuda device. + - `smooth_k` will introduce slight overhead but will improve the accuracy under most circumstances. + """ + + dtype = q.dtype + assert q.is_cuda, "Input tensors must be on cuda." + assert dtype in [torch.float16, torch.bfloat16], ( + "Input tensors must be in dtype of torch.float16 or torch.bfloat16" + ) + assert qk_quant_gran in ["per_warp", "per_thread"], ( + "qk_quant_gran must be either 'per_warp' or 'per_thread'." + ) + assert q.device == k.device == v.device, "All tensors must be on the same device." + assert q.dtype == k.dtype == v.dtype, "All tensors must have the same dtype." + + torch.cuda.set_device(v.device) + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + _is_caual = 1 if is_causal else 0 + _qk_quant_gran = 3 if qk_quant_gran == "per_thread" else 2 + _return_lse = 1 if return_lse else 0 + + head_dim_og = q.size(-1) + + if head_dim_og < 64: + q = torch.nn.functional.pad(q, (0, 64 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 64 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 64 - head_dim_og)) + elif head_dim_og > 64 and head_dim_og < 128: + q = torch.nn.functional.pad(q, (0, 128 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 128 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 128 - head_dim_og)) + elif head_dim_og > 128: + raise ValueError(f"Unsupported head_dim: {head_dim_og}") + + # assert last dim is contiguous + assert q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1, ( + "Last dim of qkv must be contiguous." + ) + + if sm_scale is None: + sm_scale = head_dim_og**-0.5 + + seq_dim = 1 if _tensor_layout == 0 else 2 + nh_dim = 2 if _tensor_layout == 0 else 1 + + if smooth_k: + km = k.mean(dim=seq_dim, keepdim=True) + nqheads = q.size(2) + nkheads = k.size(2) + q_per_kv_heads = nqheads // nkheads + if q_per_kv_heads > 1: + # nheads_k => nheads_q + km_broadcast = torch.repeat_interleave(km, q_per_kv_heads, dim=nh_dim) + else: + km_broadcast = km + if return_lse: + if tensor_layout == "NHD": + lse_correction = ( + torch.matmul( + q.transpose(1, 2), km_broadcast.transpose(1, 2).transpose(2, 3) + ) + .squeeze(-1) + .to(torch.float32) + ) + else: + lse_correction = ( + torch.matmul(q, km_broadcast.transpose(2, 3)) + .squeeze(-1) + .to(torch.float32) + ) + else: + km = None + + if qk_quant_gran == "per_warp": + q_int8, q_scale, k_int8, k_scale = per_warp_int8_cuda( + q, k, km, tensor_layout=tensor_layout, BLKQ=64, WARPQ=16, BLKK=128 + ) + elif qk_quant_gran == "per_thread": + q_int8, q_scale, k_int8, k_scale = per_thread_int8_triton( + q, + k, + km, + tensor_layout=tensor_layout, + BLKQ=64, + WARPQ=16, + BLKK=128, + WARPK=128, + ) + + o = torch.empty(q.size(), dtype=dtype, device=q.device) + + # pad v to multiple of 128 + # TODO: modify per_channel_fp8 kernel to handle this + kv_len = k.size(seq_dim) + v_pad_len = 128 - (kv_len % 128) if kv_len % 128 != 0 else 0 + if v_pad_len > 0: + if tensor_layout == "HND": + v = torch.cat( + [ + v, + torch.zeros( + v.size(0), + v.size(1), + v_pad_len, + v.size(3), + dtype=v.dtype, + device=v.device, + ), + ], + dim=2, + ) + else: + v = torch.cat( + [ + v, + torch.zeros( + v.size(0), + v_pad_len, + v.size(2), + v.size(3), + dtype=v.dtype, + device=v.device, + ), + ], + dim=1, + ) + + v_fp8, v_scale, _ = per_channel_fp8(v, tensor_layout=tensor_layout, smooth_v=False) + + if pv_accum_dtype == "fp32": + raise NotImplementedError("Please use 
pv_accum_dtype='fp32+fp32' for sm90.") + lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn( + q_int8, + k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + elif pv_accum_dtype == "fp32+fp32": + print( + "qint8", + q_int8.shape, + "qscale", + q_scale.shape, + "kint8", + k_int8.shape, + "kscale", + k_scale.shape, + "vfp8", + v_fp8.shape, + "vscale", + v_scale.shape, + ) + lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf_sm90( + q_int8, + k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + + o = o[..., :head_dim_og] + + if return_lse: + return ( + o, + lse / 1.44269504 + lse_correction * sm_scale + if smooth_k + else lse / 1.44269504, + ) + else: + return o diff --git a/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/layers.py b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/quant.py b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/quant.py new file mode 100644 index 0000000000000000000000000000000000000000..2e7a32c6a1e502bee12d0e0564ff2b90f6b00462 --- /dev/null +++ b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/quant.py @@ -0,0 +1,326 @@ +""" +Copyright (c) 2024 by SageAttention team. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import torch +from typing import Optional + +from ._ops import ops + + +def per_block_int8( + q: torch.Tensor, + k: torch.Tensor, + km: Optional[torch.Tensor] = None, + BLKQ: int = 128, + BLKK: int = 64, + sm_scale: Optional[float] = None, + tensor_layout: str = "HND", +): + """ + Quantize the query tensor `q` and the key tensor `k` with per block quantization. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + km : Optional[torch.Tensor] + The mean tensor of `k` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]``. + Should be of the same dtype as `k` if provided. Default is None. + + sm_scale : Optional[float] + The scale factor for the softmax operation. Default is ``head_dim**-0.5``. + It will be multiplied by ``1.44269504`` to work together with the triton attention kernel. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + Returns + ------- + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor] + A tuple containing: + - The quantized query tensor. Shape: Same as `q` but with `int8` dtype. 
+ - The scale tensor of the query tensor. Shape: ``[batch_size, num_qo_heads, (qo_len + BLKQ - 1) // BLKQ]`` with `float32` dtype. + - The quantized key tensor. Shape: Same as `k` but with `int8` dtype. + - The scale tensor of the key tensor. Shape: ``[batch_size, num_kv_heads, (kv_len + BLKK - 1) // BLKK]`` with `float32` dtype. + + Note + ---- + - The tensors `q` and `k` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + """ + + q_int8 = torch.empty(q.shape, dtype=torch.int8, device=q.device) + k_int8 = torch.empty(k.shape, dtype=torch.int8, device=k.device) + + if tensor_layout == "HND": + b, h_qo, qo_len, head_dim = q.shape + _, h_kv, kv_len, _ = k.shape + + elif tensor_layout == "NHD": + b, qo_len, h_qo, head_dim = q.shape + _, kv_len, h_kv, _ = k.shape + + else: + raise ValueError(f"Unknown tensor layout: {tensor_layout}") + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + + q_scale = torch.empty( + (b, h_qo, (qo_len + BLKQ - 1) // BLKQ), device=q.device, dtype=torch.float32 + ) + k_scale = torch.empty( + (b, h_kv, (kv_len + BLKK - 1) // BLKK), device=q.device, dtype=torch.float32 + ) + + if sm_scale is None: + sm_scale = head_dim**-0.5 + + sm_scale *= 1.44269504 + + ops.quant_per_block_int8_cuda(q, q_int8, q_scale, sm_scale, BLKQ, _tensor_layout) + if km is not None: + km = km.squeeze(1) if _tensor_layout == 0 else km.squeeze(2) + ops.quant_per_block_int8_fuse_sub_mean_cuda( + k, km, k_int8, k_scale, BLKK, _tensor_layout + ) + else: + # The bound CUDA op expects an sm_scale argument; use 1.0 for K to avoid scaling + ops.quant_per_block_int8_cuda(k, k_int8, k_scale, 1.0, BLKK, _tensor_layout) + + return q_int8, q_scale, k_int8, k_scale + + +def per_warp_int8( + q: torch.Tensor, + k: torch.Tensor, + km: Optional[torch.Tensor] = None, + BLKQ: int = 128, + WARPQ: int = 32, + BLKK: int = 64, + tensor_layout: str = "HND", +): + """ + Quantize the query tensor `q` with per warp quantization and the key tensor `k` with per block quantization. + Warp size of quantizing `q` is 16 or 32, with a block size of 64 or 128. + Block size of quantizing `k` is 64 or 128. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + km : Optional[torch.Tensor] + The mean tensor of `k` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]``. + Should be of the same dtype as `k` if provided. Default is None. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + Returns + ------- + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor] + A tuple containing: + - The quantized query tensor. Shape: Same as `q` but with `int8` dtype. + - The scale tensor of the query tensor. Shape: ``[batch_size, num_qo_heads, (qo_len + BLKQ - 1) // BLKQ * (BLKQ // WARPQ)]`` with `float32` dtype. + - The quantized key tensor. Shape: Same as `k` but with `int8` dtype. + - The scale tensor of the key tensor. Shape: ``[batch_size, num_kv_heads, (kv_len + BLKK - 1) // BLKK]`` with `float32` dtype. 
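
To make the scale shapes documented above concrete, here is a small sketch of calling `per_warp_int8` with its default block sizes; the tensor sizes are illustrative assumptions, not values from this PR.

```python
import torch
from sage_attention import per_warp_int8

b, h, seq_len, head_dim = 1, 4, 512, 128
q = torch.randn(b, h, seq_len, head_dim, dtype=torch.float16, device="cuda")
k = torch.randn(b, h, seq_len, head_dim, dtype=torch.float16, device="cuda")

# Defaults: BLKQ=128, WARPQ=32, BLKK=64, tensor_layout="HND".
q_int8, q_scale, k_int8, k_scale = per_warp_int8(q, k)

print(q_int8.dtype, k_int8.dtype)  # torch.int8, torch.int8
print(q_scale.shape)  # [1, 4, ceil(512/128) * (128//32)] = [1, 4, 16]
print(k_scale.shape)  # [1, 4, ceil(512/64)]              = [1, 4, 8]
```
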
+ + Note + ---- + - The tensors `q` and `k` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + """ + + q_int8 = torch.empty(q.shape, dtype=torch.int8, device=q.device) + k_int8 = torch.empty(k.shape, dtype=torch.int8, device=k.device) + + if tensor_layout == "HND": + b, h_qo, qo_len, head_dim = q.shape + _, h_kv, kv_len, _ = k.shape + + elif tensor_layout == "NHD": + b, qo_len, h_qo, head_dim = q.shape + _, kv_len, h_kv, _ = k.shape + + else: + raise ValueError(f"Unknown tensor layout: {tensor_layout}") + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + + q_scale = torch.empty( + (b, h_qo, ((qo_len + BLKQ - 1) // BLKQ) * (BLKQ // WARPQ)), + device=q.device, + dtype=torch.float32, + ) + k_scale = torch.empty( + (b, h_kv, (kv_len + BLKK - 1) // BLKK), device=q.device, dtype=torch.float32 + ) + + ops.quant_per_warp_int8_cuda(q, q_int8, q_scale, BLKQ, WARPQ, _tensor_layout) + + if km is not None: + km = km.squeeze(1) if _tensor_layout == 0 else km.squeeze(2) + ops.quant_per_block_int8_fuse_sub_mean_cuda( + k, km, k_int8, k_scale, BLKK, _tensor_layout + ) + else: + # The bound CUDA op expects an sm_scale argument; use 1.0 for K to avoid scaling + ops.quant_per_block_int8_cuda(k, k_int8, k_scale, 1.0, BLKK, _tensor_layout) + + return q_int8, q_scale, k_int8, k_scale + + +def sub_mean(v: torch.Tensor, tensor_layout: str = "HND"): + """ + Calculate the mean of the tensor `v` along the sequence length dimension and subtract it from `v`. Result is stored as fp16. + + Parameters + ---------- + v : torch.Tensor + The input tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + Returns + ------- + Tuple[torch.Tensor, torch.Tensor] + A tuple containing: + - The tensor `v_smoothed` with the mean subtracted and stored as fp16. Shape: Same as `v` with `float16` dtype. + - The mean tensor of `v` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]`` with dtype same as `v`. + + Note + ---- + - The tensors `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - The returned tensor `v_smoothed` will have dtype ``torch.float16`` regardless of the input dtype. + - The returned mean tensor will have the same dtype as the input tensor. + """ + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + vm = v.mean(dim=1 if _tensor_layout == 0 else 2) + + v_smoothed = torch.empty(v.shape, dtype=torch.float16, device=v.device) + + # subtract mean and store the result as fp16 + ops.sub_mean_cuda(v, vm, v_smoothed, _tensor_layout) + + return v_smoothed, vm + + +def per_channel_fp8( + v: torch.Tensor, + tensor_layout: str = "HND", + scale_max: float = 448.0, + smooth_v: bool = True, +): + """ + Transpose, pad and permute the tensor `v` and quantize it to fp8 with per channel quantization. + `v` is first transposed along the head dimension and the sequence length dimension, then padded to a multiple of 64. + After that, the tensor is permuted along the sequence length dimension by ``[0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15]``. + The quantization is done per channel, with the scale value and smooth factor calculated per channel. + + Parameters + ---------- + v : torch.Tensor + The input tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. 
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + scale_max : float + The maximum scale value for the quantization. Default is 448.0 (upper bound of E4M3 data format). + + smooth_v : bool + Whether to smooth the quantized tensor. Default is True. + + Returns + ------- + Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] + A tuple containing: + - The quantized tensor `v_fp8`. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, head_dim, (kv_len + 63) // 64 * 64]``, with `float8_e4m3fn` dtype. + - If `tensor_layout` is "NHD": ``[batch_size, head_dim, num_kv_heads, (kv_len + 63) // 64 * 64]``, with `float8_e4m3fn` dtype. + - The scale tensor of `v`. Shape: ``[batch_size, num_kv_heads, head_dim]`` with `float32` dtype. + - The mean tensor of `v` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]`` with `float32` dtype. + + Note + ---- + - The tensors `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - The returned mean tensor will be None if `smooth_v` is False. Otherwise it will have dtype ``torch.float32``. + """ + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + + if tensor_layout == "HND": + b, h_kv, kv_len, head_dim = v.shape + padded_len = (kv_len + 63) // 64 * 64 + v_transposed_permutted = torch.empty( + (b, h_kv, head_dim, padded_len), dtype=v.dtype, device=v.device + ) + + elif tensor_layout == "NHD": + b, kv_len, h_kv, head_dim = v.shape + padded_len = (kv_len + 63) // 64 * 64 + v_transposed_permutted = torch.empty( + (b, head_dim, h_kv, padded_len), dtype=v.dtype, device=v.device + ) + + ops.transpose_pad_permute_cuda(v, v_transposed_permutted, _tensor_layout) + + v_fp8 = torch.empty( + v_transposed_permutted.shape, dtype=torch.float8_e4m3fn, device=v.device + ) + + v_scale = torch.empty((b, h_kv, head_dim), dtype=torch.float32, device=v.device) + vm = torch.empty((b, h_kv, head_dim), dtype=torch.float32, device=v.device) + + if smooth_v: + ops.mean_scale_fuse_quant_cuda( + v_transposed_permutted, + v_fp8, + vm, + v_scale, + kv_len, + scale_max, + _tensor_layout, + ) + return v_fp8, v_scale, vm + else: + ops.scale_fuse_quant_cuda( + v_transposed_permutted, v_fp8, v_scale, kv_len, scale_max, _tensor_layout + ) + return v_fp8, v_scale, None diff --git a/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/quant_per_thread.py b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/quant_per_thread.py new file mode 100644 index 0000000000000000000000000000000000000000..ab81f57c3e6dd9df89946ba54c4e2c3844c94d34 --- /dev/null +++ b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/quant_per_thread.py @@ -0,0 +1,204 @@ +""" +Copyright (c) 2024 by SageAttention team. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
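
Before the Triton kernels that follow in `quant_per_thread.py`: per thread group they apply a symmetric INT8 scheme with round-half-away-from-zero. The plain-PyTorch reference below is only an illustration of that scalar recipe under a single global scale (the real kernels compute one scale per thread's slice); it is not code from this PR.

```python
import torch

def int8_quantize_reference(x: torch.Tensor):
    """Reference for the per-group INT8 recipe used in the Triton kernels:
    scale = max|x| / 127 + 1e-7, then round half away from zero."""
    x = x.float()
    scale = x.abs().max() / 127.0 + 1e-7          # matches `tl.max(tl.abs(x)) / 127. + 0.0000001`
    y = x / scale
    y = y + 0.5 * torch.where(y >= 0, 1.0, -1.0)  # matches `x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1)`
    # float -> int8 conversion truncates toward zero, so the +/-0.5 shift rounds away from zero.
    return y.to(torch.int8), scale

x = torch.randn(64, 128)
x_int8, scale = int8_quantize_reference(x)
print(x_int8.dtype, float(scale))
```
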
+""" + +import torch +import triton +import triton.language as tl + +@triton.jit +def quant_query_per_thread_int8_kernel(Input, Output, Scale, L, + stride_iz, stride_ih, stride_in, + stride_oz, stride_oh, stride_on, + stride_sz, stride_sh, + C: tl.constexpr, BLK: tl.constexpr): + off_blk = tl.program_id(0) // 8 + off_tld = tl.program_id(0) % 8 + off_h = tl.program_id(1) + off_b = tl.program_id(2) + + offs_n = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld + offs_k = tl.arange(0, C) + + input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :] + output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :] + scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 8 + off_tld + + x = tl.load(input_ptrs, mask=offs_n[:, None] < L) + x = x.to(tl.float32) + scale = tl.max(tl.abs(x)) / 127. + 0.0000001 + x_int8 = x / scale + x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1) + x_int8 = x_int8.to(tl.int8) + tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L) + tl.store(scale_ptrs, scale) + +@triton.jit +def quant_key_per_thread_int8_kernel(Input, Output, Scale, L, + stride_iz, stride_ih, stride_in, + stride_oz, stride_oh, stride_on, + stride_sz, stride_sh, + C: tl.constexpr, BLK: tl.constexpr): + off_blk = tl.program_id(0) // 4 + off_tld = tl.program_id(0) % 4 + off_h = tl.program_id(1) + off_b = tl.program_id(2) + + # offs_n = off_blk * BLK + tl.cat(tl.arange(0, BLK // 8) * 8, tl.arange(0, BLK // 8) * 8 + 1, True) + off_tld * 2 + # offs_k = tl.arange(0, C) + + # input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :] + # output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :] + # scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 4 + off_tld + + # x = tl.load(input_ptrs, mask=offs_n[:, None] < L) + # x = x.to(tl.float32) + # scale = tl.max(tl.abs(x)) / 127. + 0.0000001 + # x_int8 = x / scale + # x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1) + # x_int8 = x_int8.to(tl.int8) + # tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L) + # tl.store(scale_ptrs, scale) + + offs_n0 = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld * 2 + offs_n1 = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld * 2 + 1 + offs_k = tl.arange(0, C) + + input_ptrs0 = Input + off_b * stride_iz + off_h * stride_ih + offs_n0[:, None] * stride_in + offs_k[None, :] + input_ptrs1 = Input + off_b * stride_iz + off_h * stride_ih + offs_n1[:, None] * stride_in + offs_k[None, :] + output_ptrs0 = Output + off_b * stride_oz + off_h * stride_oh + offs_n0[:, None] * stride_on + offs_k[None, :] + output_ptrs1 = Output + off_b * stride_oz + off_h * stride_oh + offs_n1[:, None] * stride_on + offs_k[None, :] + scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 4 + off_tld + + x0 = tl.load(input_ptrs0, mask=offs_n0[:, None] < L) + x1 = tl.load(input_ptrs1, mask=offs_n1[:, None] < L) + x0 = x0.to(tl.float32) + x1 = x1.to(tl.float32) + scale = max(tl.max(tl.abs(x0)), tl.max(tl.abs(x1))) / 127. 
+ 0.0000001 + x0_int8 = x0 / scale + x1_int8 = x1 / scale + x0_int8 += 0.5 * tl.where(x0_int8 >= 0, 1, -1) + x1_int8 += 0.5 * tl.where(x1_int8 >= 0, 1, -1) + x0_int8 = x0_int8.to(tl.int8) + x1_int8 = x1_int8.to(tl.int8) + tl.store(output_ptrs0, x0_int8, mask=offs_n0[:, None] < L) + tl.store(output_ptrs1, x1_int8, mask=offs_n1[:, None] < L) + tl.store(scale_ptrs, scale) + +@triton.jit +def quant_query_per_thread_int4_kernel(Input, Output, Scale, L, + stride_iz, stride_ih, stride_in, + stride_oz, stride_oh, stride_on, + stride_sz, stride_sh, + C: tl.constexpr, BLK: tl.constexpr): + off_blk = tl.program_id(0) // 8 + off_tld = tl.program_id(0) % 8 + off_h = tl.program_id(1) + off_b = tl.program_id(2) + + offs_n = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld + offs_k = tl.arange(0, C) + + input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :] + output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :] + scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 8 + off_tld + + x = tl.load(input_ptrs, mask=offs_n[:, None] < L) + x = x.to(tl.float32) + scale = tl.max(tl.abs(x)) / 7. + 0.0000001 + x_int8 = x / scale + x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1) + x_int8 = x_int8.to(tl.int8) + tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L) + tl.store(scale_ptrs, scale) + +@triton.jit +def quant_key_per_thread_int4_kernel(Input, Output, Scale, L, + stride_iz, stride_ih, stride_in, + stride_oz, stride_oh, stride_on, + stride_sz, stride_sh, + C: tl.constexpr, BLK: tl.constexpr): + off_blk = tl.program_id(0) // 4 + off_tld = tl.program_id(0) % 4 + off_h = tl.program_id(1) + off_b = tl.program_id(2) + + offs_n = off_blk * BLK + tl.cat(tl.arange(0, BLK // 8) * 8, tl.arange(0, BLK // 8) * 8 + 1, True) + off_tld * 2 + offs_k = tl.arange(0, C) + + input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :] + output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :] + scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 4 + off_tld + + x = tl.load(input_ptrs, mask=offs_n[:, None] < L) + x = x.to(tl.float32) + scale = tl.max(tl.abs(x)) / 7. 
+ 0.0000001 + x_int8 = x / scale + x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1) + x_int8 = x_int8.to(tl.int8) + tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L) + tl.store(scale_ptrs, scale) + +def per_thread_int8(q, k, km=None, BLKQ=128, WARPQ=32, BLKK=64, WARPK=64, sm_scale=None, tensor_layout="HND"): + q_int8 = torch.empty(q.shape, dtype=torch.int8, device=q.device) + k_int8 = torch.empty(k.shape, dtype=torch.int8, device=k.device) + + if km is not None: + k = k - km + + if tensor_layout == "HND": + b, h_qo, qo_len, head_dim = q.shape + _, h_kv, kv_len, _ = k.shape + + stride_bz_q, stride_h_q, stride_seq_q = q.stride(0), q.stride(1), q.stride(2) + stride_bz_qo, stride_h_qo, stride_seq_qo = q_int8.stride(0), q_int8.stride(1), q_int8.stride(2) + stride_bz_k, stride_h_k, stride_seq_k = k.stride(0), k.stride(1), k.stride(2) + stride_bz_ko, stride_h_ko, stride_seq_ko = k_int8.stride(0), k_int8.stride(1), k_int8.stride(2) + elif tensor_layout == "NHD": + b, qo_len, h_qo, head_dim = q.shape + _, kv_len, h_kv, _ = k.shape + + stride_bz_q, stride_h_q, stride_seq_q = q.stride(0), q.stride(2), q.stride(1) + stride_bz_qo, stride_h_qo, stride_seq_qo = q_int8.stride(0), q_int8.stride(2), q_int8.stride(1) + stride_bz_k, stride_h_k, stride_seq_k = k.stride(0), k.stride(2), k.stride(1) + stride_bz_ko, stride_h_ko, stride_seq_ko = k_int8.stride(0), k_int8.stride(2), k_int8.stride(1) + else: + raise ValueError(f"Unknown tensor layout: {tensor_layout}") + + q_scale = torch.empty((b, h_qo, (qo_len + BLKQ - 1) // BLKQ * (BLKQ // WARPQ) * 8), device=q.device, dtype=torch.float32) + k_scale = torch.empty((b, h_kv, (kv_len + BLKK - 1) // BLKK * (BLKK // WARPK) * 4), device=q.device, dtype=torch.float32) + + if sm_scale is None: + sm_scale = head_dim**-0.5 + + grid = ((qo_len + BLKQ - 1) // BLKQ * (BLKQ // WARPQ) * 8, h_qo, b) + quant_query_per_thread_int8_kernel[grid]( + q, q_int8, q_scale, qo_len, + stride_bz_q, stride_h_q, stride_seq_q, + stride_bz_qo, stride_h_qo, stride_seq_qo, + q_scale.stride(0), q_scale.stride(1), + C=head_dim, BLK=WARPQ + ) + + grid = ((kv_len + BLKK - 1) // BLKK * (BLKK // WARPK) * 4, h_kv, b) + quant_key_per_thread_int8_kernel[grid]( + k, k_int8, k_scale, kv_len, + stride_bz_k, stride_h_k, stride_seq_k, + stride_bz_ko, stride_h_ko, stride_seq_ko, + k_scale.stride(0), k_scale.stride(1), + C=head_dim, BLK=WARPK + ) + + return q_int8, q_scale, k_int8, k_scale \ No newline at end of file diff --git a/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__init__.py b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5df69e92664edfbd0820d5126792e41e23a72762 --- /dev/null +++ b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__init__.py @@ -0,0 +1,12 @@ +from .quant import per_block_int8, per_warp_int8, sub_mean, per_channel_fp8 +from .core import sageattn, sageattn_qk_int8_pv_fp8_cuda + + +__all__ = [ + "per_block_int8", + "per_warp_int8", + "sub_mean", + "per_channel_fp8", + "sageattn", + "sageattn_qk_int8_pv_fp8_cuda", +] \ No newline at end of file diff --git a/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e6a109e1ab12f4cc4b21a8573d182233206d911f Binary files /dev/null and b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc differ diff --git 
a/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2298b2decd5b4e5fcb76a03f1155d71e41292ad5 Binary files /dev/null and b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d7face6a40bbed59ea000478be0b1385bcf803b4 Binary files /dev/null and b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3e68303a9974569a9c4734e1a7029914b49e77bb Binary files /dev/null and b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bba733f559eaa89d8b281bc99f2b3992b38cc114 Binary files /dev/null and b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/_ops.py b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..7dc56aab172e24e052a9f0c78060ecfbdff00309 --- /dev/null +++ b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _sage_attention_44b112f_dirty +ops = torch.ops._sage_attention_44b112f_dirty + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_sage_attention_44b112f_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/_sage_attention_44b112f_dirty.abi3.so b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/_sage_attention_44b112f_dirty.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..c45bfd2be6bcfb77552fff0eee73b5ebe19f6452 --- /dev/null +++ b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/_sage_attention_44b112f_dirty.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d47c952dd9781283ff0dcbd533779de33b0bfa1966dcc0cc8accd0412217c1c5 +size 26553840 diff --git a/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/core.py b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/core.py new file mode 100644 index 0000000000000000000000000000000000000000..dc44a8e1ee17a5c5c65da5adda6faf9228cca55e --- /dev/null +++ b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/core.py @@ -0,0 +1,983 @@ +""" +Copyright (c) 2024 by SageAttention team. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import torch +import torch.nn.functional as F + +from ._ops import ops + + +from .quant import per_warp_int8 as per_warp_int8_cuda +from .quant import sub_mean +from .quant import per_channel_fp8 +from .quant_per_thread import per_thread_int8 as per_thread_int8_triton + +from typing import Any, List, Literal, Optional, Tuple, Union +import warnings + + +import subprocess +import re + + +def get_cuda_version(): + try: + output = subprocess.check_output(["nvcc", "--version"]).decode() + match = re.search(r"release (\d+)\.(\d+)", output) + if match: + major, minor = int(match.group(1)), int(match.group(2)) + return major, minor + except Exception as e: + print("Failed to get CUDA version:", e) + return None, None + + +def get_cuda_arch_versions(): + cuda_archs = [] + for i in range(torch.cuda.device_count()): + major, minor = torch.cuda.get_device_capability(i) + cuda_archs.append(f"sm{major}{minor}") + return cuda_archs + + +def sageattn( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + tensor_layout: str = "HND", + is_causal: bool = False, + sm_scale: Optional[float] = None, + return_lse: bool = False, + **kwargs: Any, +): + """ + Automatically selects the appropriate implementation of the SageAttention kernel based on the GPU compute capability. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + v : torch.Tensor + The value tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + is_causal : bool + Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len. + Default: False. + + sm_scale : Optional[float] + The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``. + + return_lse : bool + Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention. + Default: False. + + Returns + ------- + torch.Tensor + The output tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + torch.Tensor + The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor). + Shape: ``[batch_size, num_qo_heads, qo_len]``. + Only returned if `return_lse` is True. + + Note + ---- + - ``num_qo_heads`` must be divisible by ``num_kv_heads``. + - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - All tensors must be on the same cuda device. 
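+
+    Example
+    -------
+    A minimal usage sketch; the batch size, head count, sequence length and head
+    dimension below are illustrative, not requirements::
+
+        >>> q = torch.randn(1, 8, 1024, 128, dtype=torch.float16, device="cuda")
+        >>> k = torch.randn(1, 8, 1024, 128, dtype=torch.float16, device="cuda")
+        >>> v = torch.randn(1, 8, 1024, 128, dtype=torch.float16, device="cuda")
+        >>> o = sageattn(q, k, v, tensor_layout="HND", is_causal=True)
+        >>> o.shape
+        torch.Size([1, 8, 1024, 128])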
+ """ + + arch = get_cuda_arch_versions()[q.device.index] + if arch == "sm80": + return sageattn_qk_int8_pv_fp16_cuda( + q, + k, + v, + tensor_layout=tensor_layout, + is_causal=is_causal, + sm_scale=sm_scale, + return_lse=return_lse, + pv_accum_dtype="fp32", + ) + elif arch == "sm89": + return sageattn_qk_int8_pv_fp8_cuda( + q, + k, + v, + tensor_layout=tensor_layout, + is_causal=is_causal, + sm_scale=sm_scale, + return_lse=return_lse, + pv_accum_dtype="fp32+fp16", + ) + elif arch == "sm90": + return sageattn_qk_int8_pv_fp8_cuda_sm90( + q, + k, + v, + tensor_layout=tensor_layout, + is_causal=is_causal, + sm_scale=sm_scale, + return_lse=return_lse, + pv_accum_dtype="fp32+fp32", + ) + elif arch == "sm120": + return sageattn_qk_int8_pv_fp8_cuda( + q, + k, + v, + tensor_layout=tensor_layout, + is_causal=is_causal, + qk_quant_gran="per_warp", + sm_scale=sm_scale, + return_lse=return_lse, + pv_accum_dtype="fp32+fp16", + ) # sm120 has accurate fp32 accumulator for fp8 mma and triton kernel is currently not usable on sm120. + else: + raise ValueError(f"Unsupported CUDA architecture: {arch}") + + +@torch.compiler.disable +def sageattn_qk_int8_pv_fp16_cuda( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + tensor_layout: str = "HND", + is_causal: bool = False, + qk_quant_gran: str = "per_thread", + sm_scale: Optional[float] = None, + pv_accum_dtype: str = "fp32", + smooth_k: bool = True, + smooth_v: bool = False, + return_lse: bool = False, + **kwargs: Any, +) -> torch.Tensor: + """ + SageAttention with INT8 quantization for Q and K, FP16 PV with FP16/FP32 accumulation, implemented using CUDA. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + v : torch.Tensor + The value tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + is_causal : bool + Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len. + Default: False. + + qk_quant_gran : str + The granularity of quantization for Q and K, either "per_warp" or "per_thread". + Default: "per_thread". + + sm_scale : Optional[float] + The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``. + + pv_accum_dtype : str + The dtype of the accumulation of the product of the value tensor and the attention weights, either "fp16", "fp16+fp32" or "fp32". + - "fp16": PV accumulation is done in fully in FP16. This is the fastest option but may lead to numerical instability. `smooth_v` option will increase the accuracy in cases when the value tensor has a large bias (like in CogVideoX-2b). + - "fp32": PV accumulation is done in FP32. This is the most accurate option but may be slower than "fp16" due to CUDA core overhead. + - "fp16+fp32": PV accumulation is done in FP16, but added to a FP32 buffer every few iterations. This offers a balance between speed and accuracy. + Default: "fp32". 
+ + smooth_k : bool + Whether to smooth the key tensor by subtracting the mean along the sequence dimension. + Default: True. + + smooth_v : bool + Whether to smooth the value tensor by subtracting the mean along the sequence dimension. + smooth_v will be ignored if pv_accum_dtype is "fp32" or "fp16+fp32". + Default: False. + + return_lse : bool + Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention. + Default: False. + + Returns + ------- + torch.Tensor + The output tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + torch.Tensor + The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor). + Shape: ``[batch_size, num_qo_heads, qo_len]``. + Only returned if `return_lse` is True. + + Note + ---- + - ``num_qo_heads`` must be divisible by ``num_kv_heads``. + - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - All tensors must be on the same cuda device. + - `smooth_k` will introduce slight overhead but will improve the accuracy under most circumstances. + """ + + dtype = q.dtype + assert q.is_cuda, "Input tensors must be on cuda." + assert dtype in [torch.float16, torch.bfloat16], ( + "Input tensors must be in dtype of torch.float16 or torch.bfloat16" + ) + assert qk_quant_gran in ["per_warp", "per_thread"], ( + "qk_quant_gran must be either 'per_warp' or 'per_thread'." + ) + assert q.device == k.device == v.device, "All tensors must be on the same device." + assert q.dtype == k.dtype == v.dtype, "All tensors must have the same dtype." + + # FIXME(DefTruth): make sage attention work compatible with distributed + # env, for example, xDiT which launch by torchrun. Without this workaround, + # sage attention will run into illegal memory access error after first + # inference step in distributed env for multi gpus inference. This small + # workaround also make sage attention work compatible with torch.compile + # through non-fullgraph compile mode. + torch.cuda.set_device(v.device) + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + _is_caual = 1 if is_causal else 0 + _qk_quant_gran = 3 if qk_quant_gran == "per_thread" else 2 + _return_lse = 1 if return_lse else 0 + + head_dim_og = q.size(-1) + + if head_dim_og < 64: + q = torch.nn.functional.pad(q, (0, 64 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 64 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 64 - head_dim_og)) + elif head_dim_og > 64 and head_dim_og < 128: + q = torch.nn.functional.pad(q, (0, 128 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 128 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 128 - head_dim_og)) + elif head_dim_og > 128: + raise ValueError(f"Unsupported head_dim: {head_dim_og}") + + # assert last dim is contiguous + assert q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1, ( + "Last dim of qkv must be contiguous." 
+ ) + + if sm_scale is None: + sm_scale = head_dim_og**-0.5 + + seq_dim = 1 if _tensor_layout == 0 else 2 + nh_dim = 2 if _tensor_layout == 0 else 1 + + if smooth_k: + km = k.mean(dim=seq_dim, keepdim=True) + nqheads = q.size(2) + nkheads = k.size(2) + q_per_kv_heads = nqheads // nkheads + if q_per_kv_heads > 1: + # nheads_k => nheads_q + km_broadcast = torch.repeat_interleave(km, q_per_kv_heads, dim=nh_dim) + else: + km_broadcast = km + if return_lse: + if tensor_layout == "NHD": + lse_correction = ( + torch.matmul( + q.transpose(1, 2), km_broadcast.transpose(1, 2).transpose(2, 3) + ) + .squeeze(-1) + .to(torch.float32) + ) + else: + lse_correction = ( + torch.matmul(q, km_broadcast.transpose(2, 3)) + .squeeze(-1) + .to(torch.float32) + ) + else: + km = None + + if qk_quant_gran == "per_warp": + q_int8, q_scale, k_int8, k_scale = per_warp_int8_cuda( + q, + k, + km, + tensor_layout=tensor_layout, + BLKQ=128, + WARPQ=(16 if (q.size(-1) == 128 and pv_accum_dtype == "fp16+fp32") else 32), + BLKK=64, + ) + elif qk_quant_gran == "per_thread": + q_int8, q_scale, k_int8, k_scale = per_thread_int8_triton( + q, + k, + km, + tensor_layout=tensor_layout, + BLKQ=128, + WARPQ=(16 if (q.size(-1) == 128 and pv_accum_dtype == "fp16+fp32") else 32), + BLKK=64, + WARPK=64, + ) + + o = torch.empty(q.size(), dtype=dtype, device=q.device) + + if pv_accum_dtype in ["fp32", "fp16+fp32"] and smooth_v: + warnings.warn(f"pv_accum_dtype is {pv_accum_dtype}, smooth_v will be ignored.") + smooth_v = False + + if pv_accum_dtype == "fp32": + v = v.to(torch.float16) + lse = _qattn_sm80.qk_int8_sv_f16_accum_f32_attn( + q_int8, + k_int8, + v, + o, + q_scale, + k_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + elif pv_accum_dtype == "fp16": + if smooth_v: + smoothed_v, vm = sub_mean(v, tensor_layout=tensor_layout) + lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_fuse_v_mean_attn( + q_int8, + k_int8, + smoothed_v, + o, + q_scale, + k_scale, + vm, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + else: + v = v.to(torch.float16) + lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_attn( + q_int8, + k_int8, + v, + o, + q_scale, + k_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + elif pv_accum_dtype == "fp16+fp32": + v = v.to(torch.float16) + lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_attn_inst_buf( + q_int8, + k_int8, + v, + o, + q_scale, + k_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + else: + raise ValueError(f"Unsupported pv_accum_dtype: {pv_accum_dtype}") + + o = o[..., :head_dim_og] + + if return_lse: + return ( + o, + lse / 1.44269504 + lse_correction * sm_scale + if smooth_k + else lse / 1.44269504, + ) + else: + return o + + +@torch.compiler.disable +def sageattn_qk_int8_pv_fp8_cuda( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + tensor_layout: str = "HND", + is_causal: bool = False, + qk_quant_gran: str = "per_thread", + sm_scale: Optional[float] = None, + pv_accum_dtype: str = "fp32+fp16", + smooth_k: bool = True, + smooth_v: bool = False, + return_lse: bool = False, + **kwargs: Any, +) -> torch.Tensor: + """ + SageAttention with INT8 quantization for Q and K, FP8 PV with FP32 accumulation, implemented using CUDA. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. 
+ + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + v : torch.Tensor + The value tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + is_causal : bool + Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len. + Default: False. + + qk_quant_gran : str + The granularity of quantization for Q and K, either "per_warp" or "per_thread". + Default: "per_thread". + + sm_scale : Optional[float] + The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``. + + pv_accum_dtype : str + The dtype of the accumulation of the product of the value tensor and the attention weights, either "fp32" or "fp32+fp32". + - "fp32": PV accumulation is done in fully in FP32. However, due to the hardware issue, there are only 22 valid bits in the FP32 accumulator. + - "fp32+fp32": PV accumulation is done in FP32 (actually FP22), but added to a FP32 buffer every few iterations. This offers a balance between speed and accuracy. + Default: "fp32+fp32". + + smooth_k : bool + Whether to smooth the key tensor by subtracting the mean along the sequence dimension. + Default: True. + + smooth_v : bool + Whether to smooth the value tensor by subtracting the mean along the sequence dimension. + smooth_v will be ignored if pv_accum_dtype is "fp32+fp32". + Default: False. + + return_lse : bool + Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention. + Default: False. + + Returns + ------- + torch.Tensor + The output tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + torch.Tensor + The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor). + Shape: ``[batch_size, num_qo_heads, qo_len]``. + Only returned if `return_lse` is True. + + Note + ---- + - ``num_qo_heads`` must be divisible by ``num_kv_heads``. + - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - All tensors must be on the same cuda device. + - `smooth_k` will introduce slight overhead but will improve the accuracy under most circumstances. + """ + + dtype = q.dtype + assert q.is_cuda, "Input tensors must be on cuda." + assert dtype in [torch.float16, torch.bfloat16], ( + "Input tensors must be in dtype of torch.float16 or torch.bfloat16" + ) + assert qk_quant_gran in ["per_warp", "per_thread"], ( + "qk_quant_gran must be either 'per_warp' or 'per_thread'." + ) + assert q.device == k.device == v.device, "All tensors must be on the same device." + assert q.dtype == k.dtype == v.dtype, "All tensors must have the same dtype." + + # cuda_major_version, cuda_minor_version = get_cuda_version() + # if(cuda_major_version, cuda_minor_version) < (12, 8) and pv_accum_dtype == 'fp32+fp16': + # warnings.warn("cuda version < 12.8, change pv_accum_dtype to 'fp32+fp32'") + # pv_accum_dtype = 'fp32+fp32' + + # FIXME(DefTruth): make sage attention work compatible with distributed + # env, for example, xDiT which launch by torchrun. 
Without this workaround, + # sage attention will run into illegal memory access error after first + # inference step in distributed env for multi gpus inference. This small + # workaround also make sage attention work compatible with torch.compile + # through non-fullgraph compile mode. + torch.cuda.set_device(v.device) + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + _is_caual = 1 if is_causal else 0 + _qk_quant_gran = 3 if qk_quant_gran == "per_thread" else 2 + _return_lse = 1 if return_lse else 0 + + head_dim_og = q.size(-1) + + if head_dim_og < 64: + q = torch.nn.functional.pad(q, (0, 64 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 64 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 64 - head_dim_og)) + elif head_dim_og > 64 and head_dim_og < 128: + q = torch.nn.functional.pad(q, (0, 128 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 128 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 128 - head_dim_og)) + elif head_dim_og > 128: + raise ValueError(f"Unsupported head_dim: {head_dim_og}") + + # assert last dim is contiguous + assert q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1, ( + "Last dim of qkv must be contiguous." + ) + + if sm_scale is None: + sm_scale = head_dim_og**-0.5 + + seq_dim = 1 if _tensor_layout == 0 else 2 + nh_dim = 2 if _tensor_layout == 0 else 1 + + if smooth_k: + km = k.mean(dim=seq_dim, keepdim=True) + nqheads = q.size(2) + nkheads = k.size(2) + q_per_kv_heads = nqheads // nkheads + if q_per_kv_heads > 1: + # nheads_k => nheads_q + km_broadcast = torch.repeat_interleave(km, q_per_kv_heads, dim=nh_dim) + else: + km_broadcast = km + if return_lse: + if tensor_layout == "NHD": + lse_correction = ( + torch.matmul( + q.transpose(1, 2), km_broadcast.transpose(1, 2).transpose(2, 3) + ) + .squeeze(-1) + .to(torch.float32) + ) + else: + lse_correction = ( + torch.matmul(q, km_broadcast.transpose(2, 3)) + .squeeze(-1) + .to(torch.float32) + ) + else: + km = None + + if qk_quant_gran == "per_warp": + q_int8, q_scale, k_int8, k_scale = per_warp_int8_cuda( + q, k, km, tensor_layout=tensor_layout, BLKQ=128, WARPQ=32, BLKK=64 + ) + elif qk_quant_gran == "per_thread": + q_int8, q_scale, k_int8, k_scale = per_thread_int8_triton( + q, k, km, tensor_layout=tensor_layout, BLKQ=128, WARPQ=32, BLKK=64, WARPK=64 + ) + + o = torch.empty(q.size(), dtype=dtype, device=q.device) + + if pv_accum_dtype == "fp32+fp32" and smooth_v: + warnings.warn("pv_accum_dtype is 'fp32+fp32', smooth_v will be ignored.") + smooth_v = False + + if pv_accum_dtype == "fp32+fp16" and smooth_v: + warnings.warn("pv_accum_dtype is 'fp32+fp16', smooth_v will be ignored.") + smooth_v = False + + quant_v_scale_max = 448.0 + if pv_accum_dtype == "fp32+fp16": + quant_v_scale_max = 2.25 + + v_fp8, v_scale, vm = per_channel_fp8( + v, tensor_layout=tensor_layout, scale_max=quant_v_scale_max, smooth_v=smooth_v + ) + print("before kernel call") + if pv_accum_dtype == "fp32": + if smooth_v: + lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_fuse_v_mean_attn( + q_int8, + k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + vm, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + torch.cuda.synchronize() + else: + lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn( + q_int8, + k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + torch.cuda.synchronize() + elif pv_accum_dtype == "fp32+fp32": + lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf( + q_int8, + 
k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + torch.cuda.synchronize() + elif pv_accum_dtype == "fp32+fp16": + lse = ops.qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf( + q_int8, + k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + torch.cuda.synchronize() + o = o[..., :head_dim_og] + print("after kernel call") + if return_lse: + return ( + o, + lse / 1.44269504 + lse_correction * sm_scale + if smooth_k + else lse / 1.44269504, + ) + else: + return o + + +@torch.compiler.disable +def sageattn_qk_int8_pv_fp8_cuda_sm90( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + tensor_layout: str = "HND", + is_causal: bool = False, + qk_quant_gran: str = "per_thread", + sm_scale: Optional[float] = None, + pv_accum_dtype: str = "fp32+fp32", + smooth_k: bool = True, + return_lse: bool = False, + **kwargs: Any, +) -> torch.Tensor: + """ + SageAttention with INT8 quantization for Q and K, FP8 PV with FP32 accumulation, implemented using CUDA. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + v : torch.Tensor + The value tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + is_causal : bool + Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len. + Default: False. + + qk_quant_gran : str + The granularity of quantization for Q and K, either "per_warp" or "per_thread". + Default: "per_thread". + + sm_scale : Optional[float] + The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``. + + pv_accum_dtype : str + The dtype of the accumulation of the product of the value tensor and the attention weights, either "fp32" or "fp32+fp32". + - "fp32": PV accumulation is done in fully in FP32. However, due to the hardware issue, there are only 22 valid bits in the FP32 accumulator. + - "fp32+fp32": PV accumulation is done in FP32 (actually FP22), but added to a FP32 buffer every few iterations. This offers a balance between speed and accuracy. + Default: "fp32+fp32". + + smooth_k : bool + Whether to smooth the key tensor by subtracting the mean along the sequence dimension. + Default: True. + + return_lse : bool + Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention. + Default: False. + + Returns + ------- + torch.Tensor + The output tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + torch.Tensor + The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor). + Shape: ``[batch_size, num_qo_heads, qo_len]``. + Only returned if `return_lse` is True. + + Note + ---- + - ``num_qo_heads`` must be divisible by ``num_kv_heads``. 
+ - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - All tensors must be on the same cuda device. + - `smooth_k` will introduce slight overhead but will improve the accuracy under most circumstances. + """ + + dtype = q.dtype + assert q.is_cuda, "Input tensors must be on cuda." + assert dtype in [torch.float16, torch.bfloat16], ( + "Input tensors must be in dtype of torch.float16 or torch.bfloat16" + ) + assert qk_quant_gran in ["per_warp", "per_thread"], ( + "qk_quant_gran must be either 'per_warp' or 'per_thread'." + ) + assert q.device == k.device == v.device, "All tensors must be on the same device." + assert q.dtype == k.dtype == v.dtype, "All tensors must have the same dtype." + + torch.cuda.set_device(v.device) + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + _is_caual = 1 if is_causal else 0 + _qk_quant_gran = 3 if qk_quant_gran == "per_thread" else 2 + _return_lse = 1 if return_lse else 0 + + head_dim_og = q.size(-1) + + if head_dim_og < 64: + q = torch.nn.functional.pad(q, (0, 64 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 64 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 64 - head_dim_og)) + elif head_dim_og > 64 and head_dim_og < 128: + q = torch.nn.functional.pad(q, (0, 128 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 128 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 128 - head_dim_og)) + elif head_dim_og > 128: + raise ValueError(f"Unsupported head_dim: {head_dim_og}") + + # assert last dim is contiguous + assert q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1, ( + "Last dim of qkv must be contiguous." + ) + + if sm_scale is None: + sm_scale = head_dim_og**-0.5 + + seq_dim = 1 if _tensor_layout == 0 else 2 + nh_dim = 2 if _tensor_layout == 0 else 1 + + if smooth_k: + km = k.mean(dim=seq_dim, keepdim=True) + nqheads = q.size(2) + nkheads = k.size(2) + q_per_kv_heads = nqheads // nkheads + if q_per_kv_heads > 1: + # nheads_k => nheads_q + km_broadcast = torch.repeat_interleave(km, q_per_kv_heads, dim=nh_dim) + else: + km_broadcast = km + if return_lse: + if tensor_layout == "NHD": + lse_correction = ( + torch.matmul( + q.transpose(1, 2), km_broadcast.transpose(1, 2).transpose(2, 3) + ) + .squeeze(-1) + .to(torch.float32) + ) + else: + lse_correction = ( + torch.matmul(q, km_broadcast.transpose(2, 3)) + .squeeze(-1) + .to(torch.float32) + ) + else: + km = None + + if qk_quant_gran == "per_warp": + q_int8, q_scale, k_int8, k_scale = per_warp_int8_cuda( + q, k, km, tensor_layout=tensor_layout, BLKQ=64, WARPQ=16, BLKK=128 + ) + elif qk_quant_gran == "per_thread": + q_int8, q_scale, k_int8, k_scale = per_thread_int8_triton( + q, + k, + km, + tensor_layout=tensor_layout, + BLKQ=64, + WARPQ=16, + BLKK=128, + WARPK=128, + ) + + o = torch.empty(q.size(), dtype=dtype, device=q.device) + + # pad v to multiple of 128 + # TODO: modify per_channel_fp8 kernel to handle this + kv_len = k.size(seq_dim) + v_pad_len = 128 - (kv_len % 128) if kv_len % 128 != 0 else 0 + if v_pad_len > 0: + if tensor_layout == "HND": + v = torch.cat( + [ + v, + torch.zeros( + v.size(0), + v.size(1), + v_pad_len, + v.size(3), + dtype=v.dtype, + device=v.device, + ), + ], + dim=2, + ) + else: + v = torch.cat( + [ + v, + torch.zeros( + v.size(0), + v_pad_len, + v.size(2), + v.size(3), + dtype=v.dtype, + device=v.device, + ), + ], + dim=1, + ) + + v_fp8, v_scale, _ = per_channel_fp8(v, tensor_layout=tensor_layout, smooth_v=False) + + if pv_accum_dtype == "fp32": + raise NotImplementedError("Please use 
pv_accum_dtype='fp32+fp32' for sm90.") + lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn( + q_int8, + k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + elif pv_accum_dtype == "fp32+fp32": + print( + "qint8", + q_int8.shape, + "qscale", + q_scale.shape, + "kint8", + k_int8.shape, + "kscale", + k_scale.shape, + "vfp8", + v_fp8.shape, + "vscale", + v_scale.shape, + ) + lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf_sm90( + q_int8, + k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + + o = o[..., :head_dim_og] + + if return_lse: + return ( + o, + lse / 1.44269504 + lse_correction * sm_scale + if smooth_k + else lse / 1.44269504, + ) + else: + return o diff --git a/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/layers.py b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/quant.py b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/quant.py new file mode 100644 index 0000000000000000000000000000000000000000..2e7a32c6a1e502bee12d0e0564ff2b90f6b00462 --- /dev/null +++ b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/quant.py @@ -0,0 +1,326 @@ +""" +Copyright (c) 2024 by SageAttention team. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import torch +from typing import Optional + +from ._ops import ops + + +def per_block_int8( + q: torch.Tensor, + k: torch.Tensor, + km: Optional[torch.Tensor] = None, + BLKQ: int = 128, + BLKK: int = 64, + sm_scale: Optional[float] = None, + tensor_layout: str = "HND", +): + """ + Quantize the query tensor `q` and the key tensor `k` with per block quantization. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + km : Optional[torch.Tensor] + The mean tensor of `k` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]``. + Should be of the same dtype as `k` if provided. Default is None. + + sm_scale : Optional[float] + The scale factor for the softmax operation. Default is ``head_dim**-0.5``. + It will be multiplied by ``1.44269504`` to work together with the triton attention kernel. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + Returns + ------- + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor] + A tuple containing: + - The quantized query tensor. Shape: Same as `q` but with `int8` dtype. 
+ - The scale tensor of the query tensor. Shape: ``[batch_size, num_qo_heads, (qo_len + BLKQ - 1) // BLKQ]`` with `float32` dtype. + - The quantized key tensor. Shape: Same as `k` but with `int8` dtype. + - The scale tensor of the key tensor. Shape: ``[batch_size, num_kv_heads, (kv_len + BLKK - 1) // BLKK]`` with `float32` dtype. + + Note + ---- + - The tensors `q` and `k` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + """ + + q_int8 = torch.empty(q.shape, dtype=torch.int8, device=q.device) + k_int8 = torch.empty(k.shape, dtype=torch.int8, device=k.device) + + if tensor_layout == "HND": + b, h_qo, qo_len, head_dim = q.shape + _, h_kv, kv_len, _ = k.shape + + elif tensor_layout == "NHD": + b, qo_len, h_qo, head_dim = q.shape + _, kv_len, h_kv, _ = k.shape + + else: + raise ValueError(f"Unknown tensor layout: {tensor_layout}") + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + + q_scale = torch.empty( + (b, h_qo, (qo_len + BLKQ - 1) // BLKQ), device=q.device, dtype=torch.float32 + ) + k_scale = torch.empty( + (b, h_kv, (kv_len + BLKK - 1) // BLKK), device=q.device, dtype=torch.float32 + ) + + if sm_scale is None: + sm_scale = head_dim**-0.5 + + sm_scale *= 1.44269504 + + ops.quant_per_block_int8_cuda(q, q_int8, q_scale, sm_scale, BLKQ, _tensor_layout) + if km is not None: + km = km.squeeze(1) if _tensor_layout == 0 else km.squeeze(2) + ops.quant_per_block_int8_fuse_sub_mean_cuda( + k, km, k_int8, k_scale, BLKK, _tensor_layout + ) + else: + # The bound CUDA op expects an sm_scale argument; use 1.0 for K to avoid scaling + ops.quant_per_block_int8_cuda(k, k_int8, k_scale, 1.0, BLKK, _tensor_layout) + + return q_int8, q_scale, k_int8, k_scale + + +def per_warp_int8( + q: torch.Tensor, + k: torch.Tensor, + km: Optional[torch.Tensor] = None, + BLKQ: int = 128, + WARPQ: int = 32, + BLKK: int = 64, + tensor_layout: str = "HND", +): + """ + Quantize the query tensor `q` with per warp quantization and the key tensor `k` with per block quantization. + Warp size of quantizing `q` is 16 or 32, with a block size of 64 or 128. + Block size of quantizing `k` is 64 or 128. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + km : Optional[torch.Tensor] + The mean tensor of `k` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]``. + Should be of the same dtype as `k` if provided. Default is None. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + Returns + ------- + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor] + A tuple containing: + - The quantized query tensor. Shape: Same as `q` but with `int8` dtype. + - The scale tensor of the query tensor. Shape: ``[batch_size, num_qo_heads, (qo_len + BLKQ - 1) // BLKQ * (BLKQ // WARPQ)]`` with `float32` dtype. + - The quantized key tensor. Shape: Same as `k` but with `int8` dtype. + - The scale tensor of the key tensor. Shape: ``[batch_size, num_kv_heads, (kv_len + BLKK - 1) // BLKK]`` with `float32` dtype. 
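+
+    Example
+    -------
+    A minimal usage sketch; shapes are illustrative and the optional `km` argument is omitted::
+
+        >>> q = torch.randn(1, 8, 1024, 128, dtype=torch.float16, device="cuda")
+        >>> k = torch.randn(1, 8, 1024, 128, dtype=torch.float16, device="cuda")
+        >>> q_int8, q_scale, k_int8, k_scale = per_warp_int8(q, k, BLKQ=128, WARPQ=32, BLKK=64)
+        >>> q_scale.shape  # one scale per warp: ceil(1024 / 128) * (128 // 32) = 32
+        torch.Size([1, 8, 32])
+        >>> k_scale.shape  # one scale per key block: ceil(1024 / 64) = 16
+        torch.Size([1, 8, 16])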
+ + Note + ---- + - The tensors `q` and `k` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + """ + + q_int8 = torch.empty(q.shape, dtype=torch.int8, device=q.device) + k_int8 = torch.empty(k.shape, dtype=torch.int8, device=k.device) + + if tensor_layout == "HND": + b, h_qo, qo_len, head_dim = q.shape + _, h_kv, kv_len, _ = k.shape + + elif tensor_layout == "NHD": + b, qo_len, h_qo, head_dim = q.shape + _, kv_len, h_kv, _ = k.shape + + else: + raise ValueError(f"Unknown tensor layout: {tensor_layout}") + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + + q_scale = torch.empty( + (b, h_qo, ((qo_len + BLKQ - 1) // BLKQ) * (BLKQ // WARPQ)), + device=q.device, + dtype=torch.float32, + ) + k_scale = torch.empty( + (b, h_kv, (kv_len + BLKK - 1) // BLKK), device=q.device, dtype=torch.float32 + ) + + ops.quant_per_warp_int8_cuda(q, q_int8, q_scale, BLKQ, WARPQ, _tensor_layout) + + if km is not None: + km = km.squeeze(1) if _tensor_layout == 0 else km.squeeze(2) + ops.quant_per_block_int8_fuse_sub_mean_cuda( + k, km, k_int8, k_scale, BLKK, _tensor_layout + ) + else: + # The bound CUDA op expects an sm_scale argument; use 1.0 for K to avoid scaling + ops.quant_per_block_int8_cuda(k, k_int8, k_scale, 1.0, BLKK, _tensor_layout) + + return q_int8, q_scale, k_int8, k_scale + + +def sub_mean(v: torch.Tensor, tensor_layout: str = "HND"): + """ + Calculate the mean of the tensor `v` along the sequence length dimension and subtract it from `v`. Result is stored as fp16. + + Parameters + ---------- + v : torch.Tensor + The input tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + Returns + ------- + Tuple[torch.Tensor, torch.Tensor] + A tuple containing: + - The tensor `v_smoothed` with the mean subtracted and stored as fp16. Shape: Same as `v` with `float16` dtype. + - The mean tensor of `v` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]`` with dtype same as `v`. + + Note + ---- + - The tensors `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - The returned tensor `v_smoothed` will have dtype ``torch.float16`` regardless of the input dtype. + - The returned mean tensor will have the same dtype as the input tensor. + """ + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + vm = v.mean(dim=1 if _tensor_layout == 0 else 2) + + v_smoothed = torch.empty(v.shape, dtype=torch.float16, device=v.device) + + # subtract mean and store the result as fp16 + ops.sub_mean_cuda(v, vm, v_smoothed, _tensor_layout) + + return v_smoothed, vm + + +def per_channel_fp8( + v: torch.Tensor, + tensor_layout: str = "HND", + scale_max: float = 448.0, + smooth_v: bool = True, +): + """ + Transpose, pad and permute the tensor `v` and quantize it to fp8 with per channel quantization. + `v` is first transposed along the head dimension and the sequence length dimension, then padded to a multiple of 64. + After that, the tensor is permuted along the sequence length dimension by ``[0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15]``. + The quantization is done per channel, with the scale value and smooth factor calculated per channel. + + Parameters + ---------- + v : torch.Tensor + The input tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. 
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + scale_max : float + The maximum scale value for the quantization. Default is 448.0 (upper bound of E4M3 data format). + + smooth_v : bool + Whether to smooth the quantized tensor. Default is True. + + Returns + ------- + Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] + A tuple containing: + - The quantized tensor `v_fp8`. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, head_dim, (kv_len + 63) // 64 * 64]``, with `float8_e4m3fn` dtype. + - If `tensor_layout` is "NHD": ``[batch_size, head_dim, num_kv_heads, (kv_len + 63) // 64 * 64]``, with `float8_e4m3fn` dtype. + - The scale tensor of `v`. Shape: ``[batch_size, num_kv_heads, head_dim]`` with `float32` dtype. + - The mean tensor of `v` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]`` with `float32` dtype. + + Note + ---- + - The tensors `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - The returned mean tensor will be None if `smooth_v` is False. Otherwise it will have dtype ``torch.float32``. + """ + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + + if tensor_layout == "HND": + b, h_kv, kv_len, head_dim = v.shape + padded_len = (kv_len + 63) // 64 * 64 + v_transposed_permutted = torch.empty( + (b, h_kv, head_dim, padded_len), dtype=v.dtype, device=v.device + ) + + elif tensor_layout == "NHD": + b, kv_len, h_kv, head_dim = v.shape + padded_len = (kv_len + 63) // 64 * 64 + v_transposed_permutted = torch.empty( + (b, head_dim, h_kv, padded_len), dtype=v.dtype, device=v.device + ) + + ops.transpose_pad_permute_cuda(v, v_transposed_permutted, _tensor_layout) + + v_fp8 = torch.empty( + v_transposed_permutted.shape, dtype=torch.float8_e4m3fn, device=v.device + ) + + v_scale = torch.empty((b, h_kv, head_dim), dtype=torch.float32, device=v.device) + vm = torch.empty((b, h_kv, head_dim), dtype=torch.float32, device=v.device) + + if smooth_v: + ops.mean_scale_fuse_quant_cuda( + v_transposed_permutted, + v_fp8, + vm, + v_scale, + kv_len, + scale_max, + _tensor_layout, + ) + return v_fp8, v_scale, vm + else: + ops.scale_fuse_quant_cuda( + v_transposed_permutted, v_fp8, v_scale, kv_len, scale_max, _tensor_layout + ) + return v_fp8, v_scale, None diff --git a/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/quant_per_thread.py b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/quant_per_thread.py new file mode 100644 index 0000000000000000000000000000000000000000..ab81f57c3e6dd9df89946ba54c4e2c3844c94d34 --- /dev/null +++ b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/quant_per_thread.py @@ -0,0 +1,204 @@ +""" +Copyright (c) 2024 by SageAttention team. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import torch +import triton +import triton.language as tl + +@triton.jit +def quant_query_per_thread_int8_kernel(Input, Output, Scale, L, + stride_iz, stride_ih, stride_in, + stride_oz, stride_oh, stride_on, + stride_sz, stride_sh, + C: tl.constexpr, BLK: tl.constexpr): + off_blk = tl.program_id(0) // 8 + off_tld = tl.program_id(0) % 8 + off_h = tl.program_id(1) + off_b = tl.program_id(2) + + offs_n = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld + offs_k = tl.arange(0, C) + + input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :] + output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :] + scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 8 + off_tld + + x = tl.load(input_ptrs, mask=offs_n[:, None] < L) + x = x.to(tl.float32) + scale = tl.max(tl.abs(x)) / 127. + 0.0000001 + x_int8 = x / scale + x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1) + x_int8 = x_int8.to(tl.int8) + tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L) + tl.store(scale_ptrs, scale) + +@triton.jit +def quant_key_per_thread_int8_kernel(Input, Output, Scale, L, + stride_iz, stride_ih, stride_in, + stride_oz, stride_oh, stride_on, + stride_sz, stride_sh, + C: tl.constexpr, BLK: tl.constexpr): + off_blk = tl.program_id(0) // 4 + off_tld = tl.program_id(0) % 4 + off_h = tl.program_id(1) + off_b = tl.program_id(2) + + # offs_n = off_blk * BLK + tl.cat(tl.arange(0, BLK // 8) * 8, tl.arange(0, BLK // 8) * 8 + 1, True) + off_tld * 2 + # offs_k = tl.arange(0, C) + + # input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :] + # output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :] + # scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 4 + off_tld + + # x = tl.load(input_ptrs, mask=offs_n[:, None] < L) + # x = x.to(tl.float32) + # scale = tl.max(tl.abs(x)) / 127. + 0.0000001 + # x_int8 = x / scale + # x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1) + # x_int8 = x_int8.to(tl.int8) + # tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L) + # tl.store(scale_ptrs, scale) + + offs_n0 = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld * 2 + offs_n1 = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld * 2 + 1 + offs_k = tl.arange(0, C) + + input_ptrs0 = Input + off_b * stride_iz + off_h * stride_ih + offs_n0[:, None] * stride_in + offs_k[None, :] + input_ptrs1 = Input + off_b * stride_iz + off_h * stride_ih + offs_n1[:, None] * stride_in + offs_k[None, :] + output_ptrs0 = Output + off_b * stride_oz + off_h * stride_oh + offs_n0[:, None] * stride_on + offs_k[None, :] + output_ptrs1 = Output + off_b * stride_oz + off_h * stride_oh + offs_n1[:, None] * stride_on + offs_k[None, :] + scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 4 + off_tld + + x0 = tl.load(input_ptrs0, mask=offs_n0[:, None] < L) + x1 = tl.load(input_ptrs1, mask=offs_n1[:, None] < L) + x0 = x0.to(tl.float32) + x1 = x1.to(tl.float32) + scale = max(tl.max(tl.abs(x0)), tl.max(tl.abs(x1))) / 127. 
+ 0.0000001 + x0_int8 = x0 / scale + x1_int8 = x1 / scale + x0_int8 += 0.5 * tl.where(x0_int8 >= 0, 1, -1) + x1_int8 += 0.5 * tl.where(x1_int8 >= 0, 1, -1) + x0_int8 = x0_int8.to(tl.int8) + x1_int8 = x1_int8.to(tl.int8) + tl.store(output_ptrs0, x0_int8, mask=offs_n0[:, None] < L) + tl.store(output_ptrs1, x1_int8, mask=offs_n1[:, None] < L) + tl.store(scale_ptrs, scale) + +@triton.jit +def quant_query_per_thread_int4_kernel(Input, Output, Scale, L, + stride_iz, stride_ih, stride_in, + stride_oz, stride_oh, stride_on, + stride_sz, stride_sh, + C: tl.constexpr, BLK: tl.constexpr): + off_blk = tl.program_id(0) // 8 + off_tld = tl.program_id(0) % 8 + off_h = tl.program_id(1) + off_b = tl.program_id(2) + + offs_n = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld + offs_k = tl.arange(0, C) + + input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :] + output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :] + scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 8 + off_tld + + x = tl.load(input_ptrs, mask=offs_n[:, None] < L) + x = x.to(tl.float32) + scale = tl.max(tl.abs(x)) / 7. + 0.0000001 + x_int8 = x / scale + x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1) + x_int8 = x_int8.to(tl.int8) + tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L) + tl.store(scale_ptrs, scale) + +@triton.jit +def quant_key_per_thread_int4_kernel(Input, Output, Scale, L, + stride_iz, stride_ih, stride_in, + stride_oz, stride_oh, stride_on, + stride_sz, stride_sh, + C: tl.constexpr, BLK: tl.constexpr): + off_blk = tl.program_id(0) // 4 + off_tld = tl.program_id(0) % 4 + off_h = tl.program_id(1) + off_b = tl.program_id(2) + + offs_n = off_blk * BLK + tl.cat(tl.arange(0, BLK // 8) * 8, tl.arange(0, BLK // 8) * 8 + 1, True) + off_tld * 2 + offs_k = tl.arange(0, C) + + input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :] + output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :] + scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 4 + off_tld + + x = tl.load(input_ptrs, mask=offs_n[:, None] < L) + x = x.to(tl.float32) + scale = tl.max(tl.abs(x)) / 7. 
+ 0.0000001 + x_int8 = x / scale + x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1) + x_int8 = x_int8.to(tl.int8) + tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L) + tl.store(scale_ptrs, scale) + +def per_thread_int8(q, k, km=None, BLKQ=128, WARPQ=32, BLKK=64, WARPK=64, sm_scale=None, tensor_layout="HND"): + q_int8 = torch.empty(q.shape, dtype=torch.int8, device=q.device) + k_int8 = torch.empty(k.shape, dtype=torch.int8, device=k.device) + + if km is not None: + k = k - km + + if tensor_layout == "HND": + b, h_qo, qo_len, head_dim = q.shape + _, h_kv, kv_len, _ = k.shape + + stride_bz_q, stride_h_q, stride_seq_q = q.stride(0), q.stride(1), q.stride(2) + stride_bz_qo, stride_h_qo, stride_seq_qo = q_int8.stride(0), q_int8.stride(1), q_int8.stride(2) + stride_bz_k, stride_h_k, stride_seq_k = k.stride(0), k.stride(1), k.stride(2) + stride_bz_ko, stride_h_ko, stride_seq_ko = k_int8.stride(0), k_int8.stride(1), k_int8.stride(2) + elif tensor_layout == "NHD": + b, qo_len, h_qo, head_dim = q.shape + _, kv_len, h_kv, _ = k.shape + + stride_bz_q, stride_h_q, stride_seq_q = q.stride(0), q.stride(2), q.stride(1) + stride_bz_qo, stride_h_qo, stride_seq_qo = q_int8.stride(0), q_int8.stride(2), q_int8.stride(1) + stride_bz_k, stride_h_k, stride_seq_k = k.stride(0), k.stride(2), k.stride(1) + stride_bz_ko, stride_h_ko, stride_seq_ko = k_int8.stride(0), k_int8.stride(2), k_int8.stride(1) + else: + raise ValueError(f"Unknown tensor layout: {tensor_layout}") + + q_scale = torch.empty((b, h_qo, (qo_len + BLKQ - 1) // BLKQ * (BLKQ // WARPQ) * 8), device=q.device, dtype=torch.float32) + k_scale = torch.empty((b, h_kv, (kv_len + BLKK - 1) // BLKK * (BLKK // WARPK) * 4), device=q.device, dtype=torch.float32) + + if sm_scale is None: + sm_scale = head_dim**-0.5 + + grid = ((qo_len + BLKQ - 1) // BLKQ * (BLKQ // WARPQ) * 8, h_qo, b) + quant_query_per_thread_int8_kernel[grid]( + q, q_int8, q_scale, qo_len, + stride_bz_q, stride_h_q, stride_seq_q, + stride_bz_qo, stride_h_qo, stride_seq_qo, + q_scale.stride(0), q_scale.stride(1), + C=head_dim, BLK=WARPQ + ) + + grid = ((kv_len + BLKK - 1) // BLKK * (BLKK // WARPK) * 4, h_kv, b) + quant_key_per_thread_int8_kernel[grid]( + k, k_int8, k_scale, kv_len, + stride_bz_k, stride_h_k, stride_seq_k, + stride_bz_ko, stride_h_ko, stride_seq_ko, + k_scale.stride(0), k_scale.stride(1), + C=head_dim, BLK=WARPK + ) + + return q_int8, q_scale, k_int8, k_scale \ No newline at end of file diff --git a/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__init__.py b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5df69e92664edfbd0820d5126792e41e23a72762 --- /dev/null +++ b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__init__.py @@ -0,0 +1,12 @@ +from .quant import per_block_int8, per_warp_int8, sub_mean, per_channel_fp8 +from .core import sageattn, sageattn_qk_int8_pv_fp8_cuda + + +__all__ = [ + "per_block_int8", + "per_warp_int8", + "sub_mean", + "per_channel_fp8", + "sageattn", + "sageattn_qk_int8_pv_fp8_cuda", +] \ No newline at end of file diff --git a/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4353512b2dfee72b0c59c3c743b89d1a6d21d53c Binary files /dev/null and b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc differ diff --git 
a/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..641efed3dee386c39c00ddfa4211d067d486a7ee Binary files /dev/null and b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1db66c5c7ef07688c5a4552f4b3d587c87edbcc5 Binary files /dev/null and b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ee3c8da8c38b79d4bad4db9d7a0389bf16df47e3 Binary files /dev/null and b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fdee849699e5bd19bfe27146e0a151b62d1fb069 Binary files /dev/null and b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/_ops.py b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..7dc56aab172e24e052a9f0c78060ecfbdff00309 --- /dev/null +++ b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _sage_attention_44b112f_dirty +ops = torch.ops._sage_attention_44b112f_dirty + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_sage_attention_44b112f_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/_sage_attention_44b112f_dirty.abi3.so b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/_sage_attention_44b112f_dirty.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..84d12f5fd0187b990f9764433cd7efce32de6cf0 --- /dev/null +++ b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/_sage_attention_44b112f_dirty.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28e181de0c6388653fb4b8b2d7347f1f547fc84fe7dc45bc66db9b1431d141bc +size 26037392 diff --git a/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/core.py b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/core.py new file mode 100644 index 0000000000000000000000000000000000000000..dc44a8e1ee17a5c5c65da5adda6faf9228cca55e --- /dev/null +++ b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/core.py @@ -0,0 +1,983 @@ +""" +Copyright (c) 2024 by SageAttention team. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import torch +import torch.nn.functional as F + +from ._ops import ops + + +from .quant import per_warp_int8 as per_warp_int8_cuda +from .quant import sub_mean +from .quant import per_channel_fp8 +from .quant_per_thread import per_thread_int8 as per_thread_int8_triton + +from typing import Any, List, Literal, Optional, Tuple, Union +import warnings + + +import subprocess +import re + + +def get_cuda_version(): + try: + output = subprocess.check_output(["nvcc", "--version"]).decode() + match = re.search(r"release (\d+)\.(\d+)", output) + if match: + major, minor = int(match.group(1)), int(match.group(2)) + return major, minor + except Exception as e: + print("Failed to get CUDA version:", e) + return None, None + + +def get_cuda_arch_versions(): + cuda_archs = [] + for i in range(torch.cuda.device_count()): + major, minor = torch.cuda.get_device_capability(i) + cuda_archs.append(f"sm{major}{minor}") + return cuda_archs + + +def sageattn( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + tensor_layout: str = "HND", + is_causal: bool = False, + sm_scale: Optional[float] = None, + return_lse: bool = False, + **kwargs: Any, +): + """ + Automatically selects the appropriate implementation of the SageAttention kernel based on the GPU compute capability. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + v : torch.Tensor + The value tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + is_causal : bool + Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len. + Default: False. + + sm_scale : Optional[float] + The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``. + + return_lse : bool + Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention. + Default: False. + + Returns + ------- + torch.Tensor + The output tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + torch.Tensor + The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor). + Shape: ``[batch_size, num_qo_heads, qo_len]``. + Only returned if `return_lse` is True. + + Note + ---- + - ``num_qo_heads`` must be divisible by ``num_kv_heads``. + - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - All tensors must be on the same cuda device. 
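+
+    Example
+    -------
+    A minimal usage sketch (shapes are illustrative; assumes fp16 inputs on a CUDA device)::
+
+        q = torch.randn(1, 8, 1024, 128, dtype=torch.float16, device="cuda")
+        k = torch.randn(1, 8, 1024, 128, dtype=torch.float16, device="cuda")
+        v = torch.randn(1, 8, 1024, 128, dtype=torch.float16, device="cuda")
+        o = sageattn(q, k, v, tensor_layout="HND", is_causal=True)
+        # o has shape [1, 8, 1024, 128], same dtype and layout as q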
+ """ + + arch = get_cuda_arch_versions()[q.device.index] + if arch == "sm80": + return sageattn_qk_int8_pv_fp16_cuda( + q, + k, + v, + tensor_layout=tensor_layout, + is_causal=is_causal, + sm_scale=sm_scale, + return_lse=return_lse, + pv_accum_dtype="fp32", + ) + elif arch == "sm89": + return sageattn_qk_int8_pv_fp8_cuda( + q, + k, + v, + tensor_layout=tensor_layout, + is_causal=is_causal, + sm_scale=sm_scale, + return_lse=return_lse, + pv_accum_dtype="fp32+fp16", + ) + elif arch == "sm90": + return sageattn_qk_int8_pv_fp8_cuda_sm90( + q, + k, + v, + tensor_layout=tensor_layout, + is_causal=is_causal, + sm_scale=sm_scale, + return_lse=return_lse, + pv_accum_dtype="fp32+fp32", + ) + elif arch == "sm120": + return sageattn_qk_int8_pv_fp8_cuda( + q, + k, + v, + tensor_layout=tensor_layout, + is_causal=is_causal, + qk_quant_gran="per_warp", + sm_scale=sm_scale, + return_lse=return_lse, + pv_accum_dtype="fp32+fp16", + ) # sm120 has accurate fp32 accumulator for fp8 mma and triton kernel is currently not usable on sm120. + else: + raise ValueError(f"Unsupported CUDA architecture: {arch}") + + +@torch.compiler.disable +def sageattn_qk_int8_pv_fp16_cuda( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + tensor_layout: str = "HND", + is_causal: bool = False, + qk_quant_gran: str = "per_thread", + sm_scale: Optional[float] = None, + pv_accum_dtype: str = "fp32", + smooth_k: bool = True, + smooth_v: bool = False, + return_lse: bool = False, + **kwargs: Any, +) -> torch.Tensor: + """ + SageAttention with INT8 quantization for Q and K, FP16 PV with FP16/FP32 accumulation, implemented using CUDA. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + v : torch.Tensor + The value tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + is_causal : bool + Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len. + Default: False. + + qk_quant_gran : str + The granularity of quantization for Q and K, either "per_warp" or "per_thread". + Default: "per_thread". + + sm_scale : Optional[float] + The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``. + + pv_accum_dtype : str + The dtype of the accumulation of the product of the value tensor and the attention weights, either "fp16", "fp16+fp32" or "fp32". + - "fp16": PV accumulation is done in fully in FP16. This is the fastest option but may lead to numerical instability. `smooth_v` option will increase the accuracy in cases when the value tensor has a large bias (like in CogVideoX-2b). + - "fp32": PV accumulation is done in FP32. This is the most accurate option but may be slower than "fp16" due to CUDA core overhead. + - "fp16+fp32": PV accumulation is done in FP16, but added to a FP32 buffer every few iterations. This offers a balance between speed and accuracy. + Default: "fp32". 
+ + smooth_k : bool + Whether to smooth the key tensor by subtracting the mean along the sequence dimension. + Default: True. + + smooth_v : bool + Whether to smooth the value tensor by subtracting the mean along the sequence dimension. + smooth_v will be ignored if pv_accum_dtype is "fp32" or "fp16+fp32". + Default: False. + + return_lse : bool + Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention. + Default: False. + + Returns + ------- + torch.Tensor + The output tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + torch.Tensor + The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor). + Shape: ``[batch_size, num_qo_heads, qo_len]``. + Only returned if `return_lse` is True. + + Note + ---- + - ``num_qo_heads`` must be divisible by ``num_kv_heads``. + - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - All tensors must be on the same cuda device. + - `smooth_k` will introduce slight overhead but will improve the accuracy under most circumstances. + """ + + dtype = q.dtype + assert q.is_cuda, "Input tensors must be on cuda." + assert dtype in [torch.float16, torch.bfloat16], ( + "Input tensors must be in dtype of torch.float16 or torch.bfloat16" + ) + assert qk_quant_gran in ["per_warp", "per_thread"], ( + "qk_quant_gran must be either 'per_warp' or 'per_thread'." + ) + assert q.device == k.device == v.device, "All tensors must be on the same device." + assert q.dtype == k.dtype == v.dtype, "All tensors must have the same dtype." + + # FIXME(DefTruth): make sage attention work compatible with distributed + # env, for example, xDiT which launch by torchrun. Without this workaround, + # sage attention will run into illegal memory access error after first + # inference step in distributed env for multi gpus inference. This small + # workaround also make sage attention work compatible with torch.compile + # through non-fullgraph compile mode. + torch.cuda.set_device(v.device) + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + _is_caual = 1 if is_causal else 0 + _qk_quant_gran = 3 if qk_quant_gran == "per_thread" else 2 + _return_lse = 1 if return_lse else 0 + + head_dim_og = q.size(-1) + + if head_dim_og < 64: + q = torch.nn.functional.pad(q, (0, 64 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 64 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 64 - head_dim_og)) + elif head_dim_og > 64 and head_dim_og < 128: + q = torch.nn.functional.pad(q, (0, 128 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 128 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 128 - head_dim_og)) + elif head_dim_og > 128: + raise ValueError(f"Unsupported head_dim: {head_dim_og}") + + # assert last dim is contiguous + assert q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1, ( + "Last dim of qkv must be contiguous." 
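+        # Note: only the last (head_dim) dimension needs to be contiguous; the leading
+        # batch/head/sequence dimensions are handled via the tensor_layout flag and
+        # explicit strides, so sliced or transposed inputs are fine as long as the
+        # head_dim stride is 1.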
+ ) + + if sm_scale is None: + sm_scale = head_dim_og**-0.5 + + seq_dim = 1 if _tensor_layout == 0 else 2 + nh_dim = 2 if _tensor_layout == 0 else 1 + + if smooth_k: + km = k.mean(dim=seq_dim, keepdim=True) + nqheads = q.size(2) + nkheads = k.size(2) + q_per_kv_heads = nqheads // nkheads + if q_per_kv_heads > 1: + # nheads_k => nheads_q + km_broadcast = torch.repeat_interleave(km, q_per_kv_heads, dim=nh_dim) + else: + km_broadcast = km + if return_lse: + if tensor_layout == "NHD": + lse_correction = ( + torch.matmul( + q.transpose(1, 2), km_broadcast.transpose(1, 2).transpose(2, 3) + ) + .squeeze(-1) + .to(torch.float32) + ) + else: + lse_correction = ( + torch.matmul(q, km_broadcast.transpose(2, 3)) + .squeeze(-1) + .to(torch.float32) + ) + else: + km = None + + if qk_quant_gran == "per_warp": + q_int8, q_scale, k_int8, k_scale = per_warp_int8_cuda( + q, + k, + km, + tensor_layout=tensor_layout, + BLKQ=128, + WARPQ=(16 if (q.size(-1) == 128 and pv_accum_dtype == "fp16+fp32") else 32), + BLKK=64, + ) + elif qk_quant_gran == "per_thread": + q_int8, q_scale, k_int8, k_scale = per_thread_int8_triton( + q, + k, + km, + tensor_layout=tensor_layout, + BLKQ=128, + WARPQ=(16 if (q.size(-1) == 128 and pv_accum_dtype == "fp16+fp32") else 32), + BLKK=64, + WARPK=64, + ) + + o = torch.empty(q.size(), dtype=dtype, device=q.device) + + if pv_accum_dtype in ["fp32", "fp16+fp32"] and smooth_v: + warnings.warn(f"pv_accum_dtype is {pv_accum_dtype}, smooth_v will be ignored.") + smooth_v = False + + if pv_accum_dtype == "fp32": + v = v.to(torch.float16) + lse = _qattn_sm80.qk_int8_sv_f16_accum_f32_attn( + q_int8, + k_int8, + v, + o, + q_scale, + k_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + elif pv_accum_dtype == "fp16": + if smooth_v: + smoothed_v, vm = sub_mean(v, tensor_layout=tensor_layout) + lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_fuse_v_mean_attn( + q_int8, + k_int8, + smoothed_v, + o, + q_scale, + k_scale, + vm, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + else: + v = v.to(torch.float16) + lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_attn( + q_int8, + k_int8, + v, + o, + q_scale, + k_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + elif pv_accum_dtype == "fp16+fp32": + v = v.to(torch.float16) + lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_attn_inst_buf( + q_int8, + k_int8, + v, + o, + q_scale, + k_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + else: + raise ValueError(f"Unsupported pv_accum_dtype: {pv_accum_dtype}") + + o = o[..., :head_dim_og] + + if return_lse: + return ( + o, + lse / 1.44269504 + lse_correction * sm_scale + if smooth_k + else lse / 1.44269504, + ) + else: + return o + + +@torch.compiler.disable +def sageattn_qk_int8_pv_fp8_cuda( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + tensor_layout: str = "HND", + is_causal: bool = False, + qk_quant_gran: str = "per_thread", + sm_scale: Optional[float] = None, + pv_accum_dtype: str = "fp32+fp16", + smooth_k: bool = True, + smooth_v: bool = False, + return_lse: bool = False, + **kwargs: Any, +) -> torch.Tensor: + """ + SageAttention with INT8 quantization for Q and K, FP8 PV with FP32 accumulation, implemented using CUDA. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. 
+ + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + v : torch.Tensor + The value tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + is_causal : bool + Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len. + Default: False. + + qk_quant_gran : str + The granularity of quantization for Q and K, either "per_warp" or "per_thread". + Default: "per_thread". + + sm_scale : Optional[float] + The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``. + + pv_accum_dtype : str + The dtype of the accumulation of the product of the value tensor and the attention weights, either "fp32" or "fp32+fp32". + - "fp32": PV accumulation is done in fully in FP32. However, due to the hardware issue, there are only 22 valid bits in the FP32 accumulator. + - "fp32+fp32": PV accumulation is done in FP32 (actually FP22), but added to a FP32 buffer every few iterations. This offers a balance between speed and accuracy. + Default: "fp32+fp32". + + smooth_k : bool + Whether to smooth the key tensor by subtracting the mean along the sequence dimension. + Default: True. + + smooth_v : bool + Whether to smooth the value tensor by subtracting the mean along the sequence dimension. + smooth_v will be ignored if pv_accum_dtype is "fp32+fp32". + Default: False. + + return_lse : bool + Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention. + Default: False. + + Returns + ------- + torch.Tensor + The output tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + torch.Tensor + The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor). + Shape: ``[batch_size, num_qo_heads, qo_len]``. + Only returned if `return_lse` is True. + + Note + ---- + - ``num_qo_heads`` must be divisible by ``num_kv_heads``. + - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - All tensors must be on the same cuda device. + - `smooth_k` will introduce slight overhead but will improve the accuracy under most circumstances. + """ + + dtype = q.dtype + assert q.is_cuda, "Input tensors must be on cuda." + assert dtype in [torch.float16, torch.bfloat16], ( + "Input tensors must be in dtype of torch.float16 or torch.bfloat16" + ) + assert qk_quant_gran in ["per_warp", "per_thread"], ( + "qk_quant_gran must be either 'per_warp' or 'per_thread'." + ) + assert q.device == k.device == v.device, "All tensors must be on the same device." + assert q.dtype == k.dtype == v.dtype, "All tensors must have the same dtype." + + # cuda_major_version, cuda_minor_version = get_cuda_version() + # if(cuda_major_version, cuda_minor_version) < (12, 8) and pv_accum_dtype == 'fp32+fp16': + # warnings.warn("cuda version < 12.8, change pv_accum_dtype to 'fp32+fp32'") + # pv_accum_dtype = 'fp32+fp32' + + # FIXME(DefTruth): make sage attention work compatible with distributed + # env, for example, xDiT which launch by torchrun. 
Without this workaround, + # sage attention will run into illegal memory access error after first + # inference step in distributed env for multi gpus inference. This small + # workaround also make sage attention work compatible with torch.compile + # through non-fullgraph compile mode. + torch.cuda.set_device(v.device) + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + _is_caual = 1 if is_causal else 0 + _qk_quant_gran = 3 if qk_quant_gran == "per_thread" else 2 + _return_lse = 1 if return_lse else 0 + + head_dim_og = q.size(-1) + + if head_dim_og < 64: + q = torch.nn.functional.pad(q, (0, 64 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 64 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 64 - head_dim_og)) + elif head_dim_og > 64 and head_dim_og < 128: + q = torch.nn.functional.pad(q, (0, 128 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 128 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 128 - head_dim_og)) + elif head_dim_og > 128: + raise ValueError(f"Unsupported head_dim: {head_dim_og}") + + # assert last dim is contiguous + assert q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1, ( + "Last dim of qkv must be contiguous." + ) + + if sm_scale is None: + sm_scale = head_dim_og**-0.5 + + seq_dim = 1 if _tensor_layout == 0 else 2 + nh_dim = 2 if _tensor_layout == 0 else 1 + + if smooth_k: + km = k.mean(dim=seq_dim, keepdim=True) + nqheads = q.size(2) + nkheads = k.size(2) + q_per_kv_heads = nqheads // nkheads + if q_per_kv_heads > 1: + # nheads_k => nheads_q + km_broadcast = torch.repeat_interleave(km, q_per_kv_heads, dim=nh_dim) + else: + km_broadcast = km + if return_lse: + if tensor_layout == "NHD": + lse_correction = ( + torch.matmul( + q.transpose(1, 2), km_broadcast.transpose(1, 2).transpose(2, 3) + ) + .squeeze(-1) + .to(torch.float32) + ) + else: + lse_correction = ( + torch.matmul(q, km_broadcast.transpose(2, 3)) + .squeeze(-1) + .to(torch.float32) + ) + else: + km = None + + if qk_quant_gran == "per_warp": + q_int8, q_scale, k_int8, k_scale = per_warp_int8_cuda( + q, k, km, tensor_layout=tensor_layout, BLKQ=128, WARPQ=32, BLKK=64 + ) + elif qk_quant_gran == "per_thread": + q_int8, q_scale, k_int8, k_scale = per_thread_int8_triton( + q, k, km, tensor_layout=tensor_layout, BLKQ=128, WARPQ=32, BLKK=64, WARPK=64 + ) + + o = torch.empty(q.size(), dtype=dtype, device=q.device) + + if pv_accum_dtype == "fp32+fp32" and smooth_v: + warnings.warn("pv_accum_dtype is 'fp32+fp32', smooth_v will be ignored.") + smooth_v = False + + if pv_accum_dtype == "fp32+fp16" and smooth_v: + warnings.warn("pv_accum_dtype is 'fp32+fp16', smooth_v will be ignored.") + smooth_v = False + + quant_v_scale_max = 448.0 + if pv_accum_dtype == "fp32+fp16": + quant_v_scale_max = 2.25 + + v_fp8, v_scale, vm = per_channel_fp8( + v, tensor_layout=tensor_layout, scale_max=quant_v_scale_max, smooth_v=smooth_v + ) + print("before kernel call") + if pv_accum_dtype == "fp32": + if smooth_v: + lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_fuse_v_mean_attn( + q_int8, + k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + vm, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + torch.cuda.synchronize() + else: + lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn( + q_int8, + k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + torch.cuda.synchronize() + elif pv_accum_dtype == "fp32+fp32": + lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf( + q_int8, + 
k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + torch.cuda.synchronize() + elif pv_accum_dtype == "fp32+fp16": + lse = ops.qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf( + q_int8, + k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + torch.cuda.synchronize() + o = o[..., :head_dim_og] + print("after kernel call") + if return_lse: + return ( + o, + lse / 1.44269504 + lse_correction * sm_scale + if smooth_k + else lse / 1.44269504, + ) + else: + return o + + +@torch.compiler.disable +def sageattn_qk_int8_pv_fp8_cuda_sm90( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + tensor_layout: str = "HND", + is_causal: bool = False, + qk_quant_gran: str = "per_thread", + sm_scale: Optional[float] = None, + pv_accum_dtype: str = "fp32+fp32", + smooth_k: bool = True, + return_lse: bool = False, + **kwargs: Any, +) -> torch.Tensor: + """ + SageAttention with INT8 quantization for Q and K, FP8 PV with FP32 accumulation, implemented using CUDA. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + v : torch.Tensor + The value tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + is_causal : bool + Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len. + Default: False. + + qk_quant_gran : str + The granularity of quantization for Q and K, either "per_warp" or "per_thread". + Default: "per_thread". + + sm_scale : Optional[float] + The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``. + + pv_accum_dtype : str + The dtype of the accumulation of the product of the value tensor and the attention weights, either "fp32" or "fp32+fp32". + - "fp32": PV accumulation is done in fully in FP32. However, due to the hardware issue, there are only 22 valid bits in the FP32 accumulator. + - "fp32+fp32": PV accumulation is done in FP32 (actually FP22), but added to a FP32 buffer every few iterations. This offers a balance between speed and accuracy. + Default: "fp32+fp32". + + smooth_k : bool + Whether to smooth the key tensor by subtracting the mean along the sequence dimension. + Default: True. + + return_lse : bool + Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention. + Default: False. + + Returns + ------- + torch.Tensor + The output tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + torch.Tensor + The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor). + Shape: ``[batch_size, num_qo_heads, qo_len]``. + Only returned if `return_lse` is True. + + Note + ---- + - ``num_qo_heads`` must be divisible by ``num_kv_heads``. 
+ - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - All tensors must be on the same cuda device. + - `smooth_k` will introduce slight overhead but will improve the accuracy under most circumstances. + """ + + dtype = q.dtype + assert q.is_cuda, "Input tensors must be on cuda." + assert dtype in [torch.float16, torch.bfloat16], ( + "Input tensors must be in dtype of torch.float16 or torch.bfloat16" + ) + assert qk_quant_gran in ["per_warp", "per_thread"], ( + "qk_quant_gran must be either 'per_warp' or 'per_thread'." + ) + assert q.device == k.device == v.device, "All tensors must be on the same device." + assert q.dtype == k.dtype == v.dtype, "All tensors must have the same dtype." + + torch.cuda.set_device(v.device) + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + _is_caual = 1 if is_causal else 0 + _qk_quant_gran = 3 if qk_quant_gran == "per_thread" else 2 + _return_lse = 1 if return_lse else 0 + + head_dim_og = q.size(-1) + + if head_dim_og < 64: + q = torch.nn.functional.pad(q, (0, 64 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 64 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 64 - head_dim_og)) + elif head_dim_og > 64 and head_dim_og < 128: + q = torch.nn.functional.pad(q, (0, 128 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 128 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 128 - head_dim_og)) + elif head_dim_og > 128: + raise ValueError(f"Unsupported head_dim: {head_dim_og}") + + # assert last dim is contiguous + assert q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1, ( + "Last dim of qkv must be contiguous." + ) + + if sm_scale is None: + sm_scale = head_dim_og**-0.5 + + seq_dim = 1 if _tensor_layout == 0 else 2 + nh_dim = 2 if _tensor_layout == 0 else 1 + + if smooth_k: + km = k.mean(dim=seq_dim, keepdim=True) + nqheads = q.size(2) + nkheads = k.size(2) + q_per_kv_heads = nqheads // nkheads + if q_per_kv_heads > 1: + # nheads_k => nheads_q + km_broadcast = torch.repeat_interleave(km, q_per_kv_heads, dim=nh_dim) + else: + km_broadcast = km + if return_lse: + if tensor_layout == "NHD": + lse_correction = ( + torch.matmul( + q.transpose(1, 2), km_broadcast.transpose(1, 2).transpose(2, 3) + ) + .squeeze(-1) + .to(torch.float32) + ) + else: + lse_correction = ( + torch.matmul(q, km_broadcast.transpose(2, 3)) + .squeeze(-1) + .to(torch.float32) + ) + else: + km = None + + if qk_quant_gran == "per_warp": + q_int8, q_scale, k_int8, k_scale = per_warp_int8_cuda( + q, k, km, tensor_layout=tensor_layout, BLKQ=64, WARPQ=16, BLKK=128 + ) + elif qk_quant_gran == "per_thread": + q_int8, q_scale, k_int8, k_scale = per_thread_int8_triton( + q, + k, + km, + tensor_layout=tensor_layout, + BLKQ=64, + WARPQ=16, + BLKK=128, + WARPK=128, + ) + + o = torch.empty(q.size(), dtype=dtype, device=q.device) + + # pad v to multiple of 128 + # TODO: modify per_channel_fp8 kernel to handle this + kv_len = k.size(seq_dim) + v_pad_len = 128 - (kv_len % 128) if kv_len % 128 != 0 else 0 + if v_pad_len > 0: + if tensor_layout == "HND": + v = torch.cat( + [ + v, + torch.zeros( + v.size(0), + v.size(1), + v_pad_len, + v.size(3), + dtype=v.dtype, + device=v.device, + ), + ], + dim=2, + ) + else: + v = torch.cat( + [ + v, + torch.zeros( + v.size(0), + v_pad_len, + v.size(2), + v.size(3), + dtype=v.dtype, + device=v.device, + ), + ], + dim=1, + ) + + v_fp8, v_scale, _ = per_channel_fp8(v, tensor_layout=tensor_layout, smooth_v=False) + + if pv_accum_dtype == "fp32": + raise NotImplementedError("Please use 
pv_accum_dtype='fp32+fp32' for sm90.") + lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn( + q_int8, + k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + elif pv_accum_dtype == "fp32+fp32": + print( + "qint8", + q_int8.shape, + "qscale", + q_scale.shape, + "kint8", + k_int8.shape, + "kscale", + k_scale.shape, + "vfp8", + v_fp8.shape, + "vscale", + v_scale.shape, + ) + lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf_sm90( + q_int8, + k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + + o = o[..., :head_dim_og] + + if return_lse: + return ( + o, + lse / 1.44269504 + lse_correction * sm_scale + if smooth_k + else lse / 1.44269504, + ) + else: + return o diff --git a/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/layers.py b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/quant.py b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/quant.py new file mode 100644 index 0000000000000000000000000000000000000000..2e7a32c6a1e502bee12d0e0564ff2b90f6b00462 --- /dev/null +++ b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/quant.py @@ -0,0 +1,326 @@ +""" +Copyright (c) 2024 by SageAttention team. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import torch +from typing import Optional + +from ._ops import ops + + +def per_block_int8( + q: torch.Tensor, + k: torch.Tensor, + km: Optional[torch.Tensor] = None, + BLKQ: int = 128, + BLKK: int = 64, + sm_scale: Optional[float] = None, + tensor_layout: str = "HND", +): + """ + Quantize the query tensor `q` and the key tensor `k` with per block quantization. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + km : Optional[torch.Tensor] + The mean tensor of `k` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]``. + Should be of the same dtype as `k` if provided. Default is None. + + sm_scale : Optional[float] + The scale factor for the softmax operation. Default is ``head_dim**-0.5``. + It will be multiplied by ``1.44269504`` to work together with the triton attention kernel. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + Returns + ------- + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor] + A tuple containing: + - The quantized query tensor. Shape: Same as `q` but with `int8` dtype. 
+ - The scale tensor of the query tensor. Shape: ``[batch_size, num_qo_heads, (qo_len + BLKQ - 1) // BLKQ]`` with `float32` dtype. + - The quantized key tensor. Shape: Same as `k` but with `int8` dtype. + - The scale tensor of the key tensor. Shape: ``[batch_size, num_kv_heads, (kv_len + BLKK - 1) // BLKK]`` with `float32` dtype. + + Note + ---- + - The tensors `q` and `k` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + """ + + q_int8 = torch.empty(q.shape, dtype=torch.int8, device=q.device) + k_int8 = torch.empty(k.shape, dtype=torch.int8, device=k.device) + + if tensor_layout == "HND": + b, h_qo, qo_len, head_dim = q.shape + _, h_kv, kv_len, _ = k.shape + + elif tensor_layout == "NHD": + b, qo_len, h_qo, head_dim = q.shape + _, kv_len, h_kv, _ = k.shape + + else: + raise ValueError(f"Unknown tensor layout: {tensor_layout}") + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + + q_scale = torch.empty( + (b, h_qo, (qo_len + BLKQ - 1) // BLKQ), device=q.device, dtype=torch.float32 + ) + k_scale = torch.empty( + (b, h_kv, (kv_len + BLKK - 1) // BLKK), device=q.device, dtype=torch.float32 + ) + + if sm_scale is None: + sm_scale = head_dim**-0.5 + + sm_scale *= 1.44269504 + + ops.quant_per_block_int8_cuda(q, q_int8, q_scale, sm_scale, BLKQ, _tensor_layout) + if km is not None: + km = km.squeeze(1) if _tensor_layout == 0 else km.squeeze(2) + ops.quant_per_block_int8_fuse_sub_mean_cuda( + k, km, k_int8, k_scale, BLKK, _tensor_layout + ) + else: + # The bound CUDA op expects an sm_scale argument; use 1.0 for K to avoid scaling + ops.quant_per_block_int8_cuda(k, k_int8, k_scale, 1.0, BLKK, _tensor_layout) + + return q_int8, q_scale, k_int8, k_scale + + +def per_warp_int8( + q: torch.Tensor, + k: torch.Tensor, + km: Optional[torch.Tensor] = None, + BLKQ: int = 128, + WARPQ: int = 32, + BLKK: int = 64, + tensor_layout: str = "HND", +): + """ + Quantize the query tensor `q` with per warp quantization and the key tensor `k` with per block quantization. + Warp size of quantizing `q` is 16 or 32, with a block size of 64 or 128. + Block size of quantizing `k` is 64 or 128. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + km : Optional[torch.Tensor] + The mean tensor of `k` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]``. + Should be of the same dtype as `k` if provided. Default is None. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + Returns + ------- + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor] + A tuple containing: + - The quantized query tensor. Shape: Same as `q` but with `int8` dtype. + - The scale tensor of the query tensor. Shape: ``[batch_size, num_qo_heads, (qo_len + BLKQ - 1) // BLKQ * (BLKQ // WARPQ)]`` with `float32` dtype. + - The quantized key tensor. Shape: Same as `k` but with `int8` dtype. + - The scale tensor of the key tensor. Shape: ``[batch_size, num_kv_heads, (kv_len + BLKK - 1) // BLKK]`` with `float32` dtype. 
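+
+        For example (illustrative numbers): with ``qo_len = 1000``, ``BLKQ = 128`` and
+        ``WARPQ = 32``, the query scale has ``ceil(1000 / 128) * (128 // 32) = 8 * 4 = 32``
+        entries per head; with ``kv_len = 1000`` and ``BLKK = 64``, the key scale has
+        ``ceil(1000 / 64) = 16`` entries per head.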
+ + Note + ---- + - The tensors `q` and `k` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + """ + + q_int8 = torch.empty(q.shape, dtype=torch.int8, device=q.device) + k_int8 = torch.empty(k.shape, dtype=torch.int8, device=k.device) + + if tensor_layout == "HND": + b, h_qo, qo_len, head_dim = q.shape + _, h_kv, kv_len, _ = k.shape + + elif tensor_layout == "NHD": + b, qo_len, h_qo, head_dim = q.shape + _, kv_len, h_kv, _ = k.shape + + else: + raise ValueError(f"Unknown tensor layout: {tensor_layout}") + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + + q_scale = torch.empty( + (b, h_qo, ((qo_len + BLKQ - 1) // BLKQ) * (BLKQ // WARPQ)), + device=q.device, + dtype=torch.float32, + ) + k_scale = torch.empty( + (b, h_kv, (kv_len + BLKK - 1) // BLKK), device=q.device, dtype=torch.float32 + ) + + ops.quant_per_warp_int8_cuda(q, q_int8, q_scale, BLKQ, WARPQ, _tensor_layout) + + if km is not None: + km = km.squeeze(1) if _tensor_layout == 0 else km.squeeze(2) + ops.quant_per_block_int8_fuse_sub_mean_cuda( + k, km, k_int8, k_scale, BLKK, _tensor_layout + ) + else: + # The bound CUDA op expects an sm_scale argument; use 1.0 for K to avoid scaling + ops.quant_per_block_int8_cuda(k, k_int8, k_scale, 1.0, BLKK, _tensor_layout) + + return q_int8, q_scale, k_int8, k_scale + + +def sub_mean(v: torch.Tensor, tensor_layout: str = "HND"): + """ + Calculate the mean of the tensor `v` along the sequence length dimension and subtract it from `v`. Result is stored as fp16. + + Parameters + ---------- + v : torch.Tensor + The input tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + Returns + ------- + Tuple[torch.Tensor, torch.Tensor] + A tuple containing: + - The tensor `v_smoothed` with the mean subtracted and stored as fp16. Shape: Same as `v` with `float16` dtype. + - The mean tensor of `v` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]`` with dtype same as `v`. + + Note + ---- + - The tensors `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - The returned tensor `v_smoothed` will have dtype ``torch.float16`` regardless of the input dtype. + - The returned mean tensor will have the same dtype as the input tensor. + """ + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + vm = v.mean(dim=1 if _tensor_layout == 0 else 2) + + v_smoothed = torch.empty(v.shape, dtype=torch.float16, device=v.device) + + # subtract mean and store the result as fp16 + ops.sub_mean_cuda(v, vm, v_smoothed, _tensor_layout) + + return v_smoothed, vm + + +def per_channel_fp8( + v: torch.Tensor, + tensor_layout: str = "HND", + scale_max: float = 448.0, + smooth_v: bool = True, +): + """ + Transpose, pad and permute the tensor `v` and quantize it to fp8 with per channel quantization. + `v` is first transposed along the head dimension and the sequence length dimension, then padded to a multiple of 64. + After that, the tensor is permuted along the sequence length dimension by ``[0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15]``. + The quantization is done per channel, with the scale value and smooth factor calculated per channel. + + Parameters + ---------- + v : torch.Tensor + The input tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. 
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + scale_max : float + The maximum scale value for the quantization. Default is 448.0 (upper bound of E4M3 data format). + + smooth_v : bool + Whether to smooth the quantized tensor. Default is True. + + Returns + ------- + Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] + A tuple containing: + - The quantized tensor `v_fp8`. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, head_dim, (kv_len + 63) // 64 * 64]``, with `float8_e4m3fn` dtype. + - If `tensor_layout` is "NHD": ``[batch_size, head_dim, num_kv_heads, (kv_len + 63) // 64 * 64]``, with `float8_e4m3fn` dtype. + - The scale tensor of `v`. Shape: ``[batch_size, num_kv_heads, head_dim]`` with `float32` dtype. + - The mean tensor of `v` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]`` with `float32` dtype. + + Note + ---- + - The tensors `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - The returned mean tensor will be None if `smooth_v` is False. Otherwise it will have dtype ``torch.float32``. + """ + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + + if tensor_layout == "HND": + b, h_kv, kv_len, head_dim = v.shape + padded_len = (kv_len + 63) // 64 * 64 + v_transposed_permutted = torch.empty( + (b, h_kv, head_dim, padded_len), dtype=v.dtype, device=v.device + ) + + elif tensor_layout == "NHD": + b, kv_len, h_kv, head_dim = v.shape + padded_len = (kv_len + 63) // 64 * 64 + v_transposed_permutted = torch.empty( + (b, head_dim, h_kv, padded_len), dtype=v.dtype, device=v.device + ) + + ops.transpose_pad_permute_cuda(v, v_transposed_permutted, _tensor_layout) + + v_fp8 = torch.empty( + v_transposed_permutted.shape, dtype=torch.float8_e4m3fn, device=v.device + ) + + v_scale = torch.empty((b, h_kv, head_dim), dtype=torch.float32, device=v.device) + vm = torch.empty((b, h_kv, head_dim), dtype=torch.float32, device=v.device) + + if smooth_v: + ops.mean_scale_fuse_quant_cuda( + v_transposed_permutted, + v_fp8, + vm, + v_scale, + kv_len, + scale_max, + _tensor_layout, + ) + return v_fp8, v_scale, vm + else: + ops.scale_fuse_quant_cuda( + v_transposed_permutted, v_fp8, v_scale, kv_len, scale_max, _tensor_layout + ) + return v_fp8, v_scale, None diff --git a/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/quant_per_thread.py b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/quant_per_thread.py new file mode 100644 index 0000000000000000000000000000000000000000..ab81f57c3e6dd9df89946ba54c4e2c3844c94d34 --- /dev/null +++ b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/quant_per_thread.py @@ -0,0 +1,204 @@ +""" +Copyright (c) 2024 by SageAttention team. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import torch +import triton +import triton.language as tl + +@triton.jit +def quant_query_per_thread_int8_kernel(Input, Output, Scale, L, + stride_iz, stride_ih, stride_in, + stride_oz, stride_oh, stride_on, + stride_sz, stride_sh, + C: tl.constexpr, BLK: tl.constexpr): + off_blk = tl.program_id(0) // 8 + off_tld = tl.program_id(0) % 8 + off_h = tl.program_id(1) + off_b = tl.program_id(2) + + offs_n = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld + offs_k = tl.arange(0, C) + + input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :] + output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :] + scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 8 + off_tld + + x = tl.load(input_ptrs, mask=offs_n[:, None] < L) + x = x.to(tl.float32) + scale = tl.max(tl.abs(x)) / 127. + 0.0000001 + x_int8 = x / scale + x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1) + x_int8 = x_int8.to(tl.int8) + tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L) + tl.store(scale_ptrs, scale) + +@triton.jit +def quant_key_per_thread_int8_kernel(Input, Output, Scale, L, + stride_iz, stride_ih, stride_in, + stride_oz, stride_oh, stride_on, + stride_sz, stride_sh, + C: tl.constexpr, BLK: tl.constexpr): + off_blk = tl.program_id(0) // 4 + off_tld = tl.program_id(0) % 4 + off_h = tl.program_id(1) + off_b = tl.program_id(2) + + # offs_n = off_blk * BLK + tl.cat(tl.arange(0, BLK // 8) * 8, tl.arange(0, BLK // 8) * 8 + 1, True) + off_tld * 2 + # offs_k = tl.arange(0, C) + + # input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :] + # output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :] + # scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 4 + off_tld + + # x = tl.load(input_ptrs, mask=offs_n[:, None] < L) + # x = x.to(tl.float32) + # scale = tl.max(tl.abs(x)) / 127. + 0.0000001 + # x_int8 = x / scale + # x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1) + # x_int8 = x_int8.to(tl.int8) + # tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L) + # tl.store(scale_ptrs, scale) + + offs_n0 = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld * 2 + offs_n1 = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld * 2 + 1 + offs_k = tl.arange(0, C) + + input_ptrs0 = Input + off_b * stride_iz + off_h * stride_ih + offs_n0[:, None] * stride_in + offs_k[None, :] + input_ptrs1 = Input + off_b * stride_iz + off_h * stride_ih + offs_n1[:, None] * stride_in + offs_k[None, :] + output_ptrs0 = Output + off_b * stride_oz + off_h * stride_oh + offs_n0[:, None] * stride_on + offs_k[None, :] + output_ptrs1 = Output + off_b * stride_oz + off_h * stride_oh + offs_n1[:, None] * stride_on + offs_k[None, :] + scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 4 + off_tld + + x0 = tl.load(input_ptrs0, mask=offs_n0[:, None] < L) + x1 = tl.load(input_ptrs1, mask=offs_n1[:, None] < L) + x0 = x0.to(tl.float32) + x1 = x1.to(tl.float32) + scale = max(tl.max(tl.abs(x0)), tl.max(tl.abs(x1))) / 127. 
+ 0.0000001 + x0_int8 = x0 / scale + x1_int8 = x1 / scale + x0_int8 += 0.5 * tl.where(x0_int8 >= 0, 1, -1) + x1_int8 += 0.5 * tl.where(x1_int8 >= 0, 1, -1) + x0_int8 = x0_int8.to(tl.int8) + x1_int8 = x1_int8.to(tl.int8) + tl.store(output_ptrs0, x0_int8, mask=offs_n0[:, None] < L) + tl.store(output_ptrs1, x1_int8, mask=offs_n1[:, None] < L) + tl.store(scale_ptrs, scale) + +@triton.jit +def quant_query_per_thread_int4_kernel(Input, Output, Scale, L, + stride_iz, stride_ih, stride_in, + stride_oz, stride_oh, stride_on, + stride_sz, stride_sh, + C: tl.constexpr, BLK: tl.constexpr): + off_blk = tl.program_id(0) // 8 + off_tld = tl.program_id(0) % 8 + off_h = tl.program_id(1) + off_b = tl.program_id(2) + + offs_n = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld + offs_k = tl.arange(0, C) + + input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :] + output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :] + scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 8 + off_tld + + x = tl.load(input_ptrs, mask=offs_n[:, None] < L) + x = x.to(tl.float32) + scale = tl.max(tl.abs(x)) / 7. + 0.0000001 + x_int8 = x / scale + x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1) + x_int8 = x_int8.to(tl.int8) + tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L) + tl.store(scale_ptrs, scale) + +@triton.jit +def quant_key_per_thread_int4_kernel(Input, Output, Scale, L, + stride_iz, stride_ih, stride_in, + stride_oz, stride_oh, stride_on, + stride_sz, stride_sh, + C: tl.constexpr, BLK: tl.constexpr): + off_blk = tl.program_id(0) // 4 + off_tld = tl.program_id(0) % 4 + off_h = tl.program_id(1) + off_b = tl.program_id(2) + + offs_n = off_blk * BLK + tl.cat(tl.arange(0, BLK // 8) * 8, tl.arange(0, BLK // 8) * 8 + 1, True) + off_tld * 2 + offs_k = tl.arange(0, C) + + input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :] + output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :] + scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 4 + off_tld + + x = tl.load(input_ptrs, mask=offs_n[:, None] < L) + x = x.to(tl.float32) + scale = tl.max(tl.abs(x)) / 7. 
+ 0.0000001 + x_int8 = x / scale + x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1) + x_int8 = x_int8.to(tl.int8) + tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L) + tl.store(scale_ptrs, scale) + +def per_thread_int8(q, k, km=None, BLKQ=128, WARPQ=32, BLKK=64, WARPK=64, sm_scale=None, tensor_layout="HND"): + q_int8 = torch.empty(q.shape, dtype=torch.int8, device=q.device) + k_int8 = torch.empty(k.shape, dtype=torch.int8, device=k.device) + + if km is not None: + k = k - km + + if tensor_layout == "HND": + b, h_qo, qo_len, head_dim = q.shape + _, h_kv, kv_len, _ = k.shape + + stride_bz_q, stride_h_q, stride_seq_q = q.stride(0), q.stride(1), q.stride(2) + stride_bz_qo, stride_h_qo, stride_seq_qo = q_int8.stride(0), q_int8.stride(1), q_int8.stride(2) + stride_bz_k, stride_h_k, stride_seq_k = k.stride(0), k.stride(1), k.stride(2) + stride_bz_ko, stride_h_ko, stride_seq_ko = k_int8.stride(0), k_int8.stride(1), k_int8.stride(2) + elif tensor_layout == "NHD": + b, qo_len, h_qo, head_dim = q.shape + _, kv_len, h_kv, _ = k.shape + + stride_bz_q, stride_h_q, stride_seq_q = q.stride(0), q.stride(2), q.stride(1) + stride_bz_qo, stride_h_qo, stride_seq_qo = q_int8.stride(0), q_int8.stride(2), q_int8.stride(1) + stride_bz_k, stride_h_k, stride_seq_k = k.stride(0), k.stride(2), k.stride(1) + stride_bz_ko, stride_h_ko, stride_seq_ko = k_int8.stride(0), k_int8.stride(2), k_int8.stride(1) + else: + raise ValueError(f"Unknown tensor layout: {tensor_layout}") + + q_scale = torch.empty((b, h_qo, (qo_len + BLKQ - 1) // BLKQ * (BLKQ // WARPQ) * 8), device=q.device, dtype=torch.float32) + k_scale = torch.empty((b, h_kv, (kv_len + BLKK - 1) // BLKK * (BLKK // WARPK) * 4), device=q.device, dtype=torch.float32) + + if sm_scale is None: + sm_scale = head_dim**-0.5 + + grid = ((qo_len + BLKQ - 1) // BLKQ * (BLKQ // WARPQ) * 8, h_qo, b) + quant_query_per_thread_int8_kernel[grid]( + q, q_int8, q_scale, qo_len, + stride_bz_q, stride_h_q, stride_seq_q, + stride_bz_qo, stride_h_qo, stride_seq_qo, + q_scale.stride(0), q_scale.stride(1), + C=head_dim, BLK=WARPQ + ) + + grid = ((kv_len + BLKK - 1) // BLKK * (BLKK // WARPK) * 4, h_kv, b) + quant_key_per_thread_int8_kernel[grid]( + k, k_int8, k_scale, kv_len, + stride_bz_k, stride_h_k, stride_seq_k, + stride_bz_ko, stride_h_ko, stride_seq_ko, + k_scale.stride(0), k_scale.stride(1), + C=head_dim, BLK=WARPK + ) + + return q_int8, q_scale, k_int8, k_scale \ No newline at end of file diff --git a/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__init__.py b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5df69e92664edfbd0820d5126792e41e23a72762 --- /dev/null +++ b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__init__.py @@ -0,0 +1,12 @@ +from .quant import per_block_int8, per_warp_int8, sub_mean, per_channel_fp8 +from .core import sageattn, sageattn_qk_int8_pv_fp8_cuda + + +__all__ = [ + "per_block_int8", + "per_warp_int8", + "sub_mean", + "per_channel_fp8", + "sageattn", + "sageattn_qk_int8_pv_fp8_cuda", +] \ No newline at end of file diff --git a/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ad915a1553f70a034d95cf30d35f65f414cdddf4 Binary files /dev/null and b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc differ diff --git 
a/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b15635cec94a98d3a8ae65de7d8470620cdc1ca5 Binary files /dev/null and b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d862b1f4b01b67e8739eace8ac79a35d6a0fb55e Binary files /dev/null and b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a90a9e0a4b0b2c71e99fb920efbddfbe0d73c8b3 Binary files /dev/null and b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f05baf4ed4fda1f06cdf6559aeac4612bf2413df Binary files /dev/null and b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/_ops.py b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..7dc56aab172e24e052a9f0c78060ecfbdff00309 --- /dev/null +++ b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _sage_attention_44b112f_dirty +ops = torch.ops._sage_attention_44b112f_dirty + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_sage_attention_44b112f_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/_sage_attention_44b112f_dirty.abi3.so b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/_sage_attention_44b112f_dirty.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..8f2d058f3d5e4c696f8dafa57189b7263e0c607d --- /dev/null +++ b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/_sage_attention_44b112f_dirty.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:826ab66e6c33b3b2b17c30371934a55e972d560197c5492f4dedf6fcc29f1a1e +size 26553920 diff --git a/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/core.py b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/core.py new file mode 100644 index 0000000000000000000000000000000000000000..dc44a8e1ee17a5c5c65da5adda6faf9228cca55e --- /dev/null +++ b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/core.py @@ -0,0 +1,983 @@ +""" +Copyright (c) 2024 by SageAttention team. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import torch +import torch.nn.functional as F + +from ._ops import ops + + +from .quant import per_warp_int8 as per_warp_int8_cuda +from .quant import sub_mean +from .quant import per_channel_fp8 +from .quant_per_thread import per_thread_int8 as per_thread_int8_triton + +from typing import Any, List, Literal, Optional, Tuple, Union +import warnings + + +import subprocess +import re + + +def get_cuda_version(): + try: + output = subprocess.check_output(["nvcc", "--version"]).decode() + match = re.search(r"release (\d+)\.(\d+)", output) + if match: + major, minor = int(match.group(1)), int(match.group(2)) + return major, minor + except Exception as e: + print("Failed to get CUDA version:", e) + return None, None + + +def get_cuda_arch_versions(): + cuda_archs = [] + for i in range(torch.cuda.device_count()): + major, minor = torch.cuda.get_device_capability(i) + cuda_archs.append(f"sm{major}{minor}") + return cuda_archs + + +def sageattn( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + tensor_layout: str = "HND", + is_causal: bool = False, + sm_scale: Optional[float] = None, + return_lse: bool = False, + **kwargs: Any, +): + """ + Automatically selects the appropriate implementation of the SageAttention kernel based on the GPU compute capability. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + v : torch.Tensor + The value tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + is_causal : bool + Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len. + Default: False. + + sm_scale : Optional[float] + The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``. + + return_lse : bool + Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention. + Default: False. + + Returns + ------- + torch.Tensor + The output tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + torch.Tensor + The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor). + Shape: ``[batch_size, num_qo_heads, qo_len]``. + Only returned if `return_lse` is True. + + Note + ---- + - ``num_qo_heads`` must be divisible by ``num_kv_heads``. + - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - All tensors must be on the same cuda device. 
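+
+    Example
+    -------
+    A sketch using the "NHD" layout and returning the logsumexp (shapes are illustrative;
+    assumes bf16 inputs on a CUDA device)::
+
+        q = torch.randn(2, 4096, 16, 64, dtype=torch.bfloat16, device="cuda")
+        k = torch.randn(2, 4096, 16, 64, dtype=torch.bfloat16, device="cuda")
+        v = torch.randn(2, 4096, 16, 64, dtype=torch.bfloat16, device="cuda")
+        o, lse = sageattn(q, k, v, tensor_layout="NHD", return_lse=True)
+        # o: [2, 4096, 16, 64]; lse: [2, 16, 4096]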
+ """ + + arch = get_cuda_arch_versions()[q.device.index] + if arch == "sm80": + return sageattn_qk_int8_pv_fp16_cuda( + q, + k, + v, + tensor_layout=tensor_layout, + is_causal=is_causal, + sm_scale=sm_scale, + return_lse=return_lse, + pv_accum_dtype="fp32", + ) + elif arch == "sm89": + return sageattn_qk_int8_pv_fp8_cuda( + q, + k, + v, + tensor_layout=tensor_layout, + is_causal=is_causal, + sm_scale=sm_scale, + return_lse=return_lse, + pv_accum_dtype="fp32+fp16", + ) + elif arch == "sm90": + return sageattn_qk_int8_pv_fp8_cuda_sm90( + q, + k, + v, + tensor_layout=tensor_layout, + is_causal=is_causal, + sm_scale=sm_scale, + return_lse=return_lse, + pv_accum_dtype="fp32+fp32", + ) + elif arch == "sm120": + return sageattn_qk_int8_pv_fp8_cuda( + q, + k, + v, + tensor_layout=tensor_layout, + is_causal=is_causal, + qk_quant_gran="per_warp", + sm_scale=sm_scale, + return_lse=return_lse, + pv_accum_dtype="fp32+fp16", + ) # sm120 has accurate fp32 accumulator for fp8 mma and triton kernel is currently not usable on sm120. + else: + raise ValueError(f"Unsupported CUDA architecture: {arch}") + + +@torch.compiler.disable +def sageattn_qk_int8_pv_fp16_cuda( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + tensor_layout: str = "HND", + is_causal: bool = False, + qk_quant_gran: str = "per_thread", + sm_scale: Optional[float] = None, + pv_accum_dtype: str = "fp32", + smooth_k: bool = True, + smooth_v: bool = False, + return_lse: bool = False, + **kwargs: Any, +) -> torch.Tensor: + """ + SageAttention with INT8 quantization for Q and K, FP16 PV with FP16/FP32 accumulation, implemented using CUDA. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + v : torch.Tensor + The value tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + is_causal : bool + Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len. + Default: False. + + qk_quant_gran : str + The granularity of quantization for Q and K, either "per_warp" or "per_thread". + Default: "per_thread". + + sm_scale : Optional[float] + The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``. + + pv_accum_dtype : str + The dtype of the accumulation of the product of the value tensor and the attention weights, either "fp16", "fp16+fp32" or "fp32". + - "fp16": PV accumulation is done in fully in FP16. This is the fastest option but may lead to numerical instability. `smooth_v` option will increase the accuracy in cases when the value tensor has a large bias (like in CogVideoX-2b). + - "fp32": PV accumulation is done in FP32. This is the most accurate option but may be slower than "fp16" due to CUDA core overhead. + - "fp16+fp32": PV accumulation is done in FP16, but added to a FP32 buffer every few iterations. This offers a balance between speed and accuracy. + Default: "fp32". 
+ + smooth_k : bool + Whether to smooth the key tensor by subtracting the mean along the sequence dimension. + Default: True. + + smooth_v : bool + Whether to smooth the value tensor by subtracting the mean along the sequence dimension. + smooth_v will be ignored if pv_accum_dtype is "fp32" or "fp16+fp32". + Default: False. + + return_lse : bool + Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention. + Default: False. + + Returns + ------- + torch.Tensor + The output tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + torch.Tensor + The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor). + Shape: ``[batch_size, num_qo_heads, qo_len]``. + Only returned if `return_lse` is True. + + Note + ---- + - ``num_qo_heads`` must be divisible by ``num_kv_heads``. + - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - All tensors must be on the same cuda device. + - `smooth_k` will introduce slight overhead but will improve the accuracy under most circumstances. + """ + + dtype = q.dtype + assert q.is_cuda, "Input tensors must be on cuda." + assert dtype in [torch.float16, torch.bfloat16], ( + "Input tensors must be in dtype of torch.float16 or torch.bfloat16" + ) + assert qk_quant_gran in ["per_warp", "per_thread"], ( + "qk_quant_gran must be either 'per_warp' or 'per_thread'." + ) + assert q.device == k.device == v.device, "All tensors must be on the same device." + assert q.dtype == k.dtype == v.dtype, "All tensors must have the same dtype." + + # FIXME(DefTruth): make sage attention work compatible with distributed + # env, for example, xDiT which launch by torchrun. Without this workaround, + # sage attention will run into illegal memory access error after first + # inference step in distributed env for multi gpus inference. This small + # workaround also make sage attention work compatible with torch.compile + # through non-fullgraph compile mode. + torch.cuda.set_device(v.device) + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + _is_caual = 1 if is_causal else 0 + _qk_quant_gran = 3 if qk_quant_gran == "per_thread" else 2 + _return_lse = 1 if return_lse else 0 + + head_dim_og = q.size(-1) + + if head_dim_og < 64: + q = torch.nn.functional.pad(q, (0, 64 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 64 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 64 - head_dim_og)) + elif head_dim_og > 64 and head_dim_og < 128: + q = torch.nn.functional.pad(q, (0, 128 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 128 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 128 - head_dim_og)) + elif head_dim_og > 128: + raise ValueError(f"Unsupported head_dim: {head_dim_og}") + + # assert last dim is contiguous + assert q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1, ( + "Last dim of qkv must be contiguous." 
+ ) + + if sm_scale is None: + sm_scale = head_dim_og**-0.5 + + seq_dim = 1 if _tensor_layout == 0 else 2 + nh_dim = 2 if _tensor_layout == 0 else 1 + + if smooth_k: + km = k.mean(dim=seq_dim, keepdim=True) + nqheads = q.size(2) + nkheads = k.size(2) + q_per_kv_heads = nqheads // nkheads + if q_per_kv_heads > 1: + # nheads_k => nheads_q + km_broadcast = torch.repeat_interleave(km, q_per_kv_heads, dim=nh_dim) + else: + km_broadcast = km + if return_lse: + if tensor_layout == "NHD": + lse_correction = ( + torch.matmul( + q.transpose(1, 2), km_broadcast.transpose(1, 2).transpose(2, 3) + ) + .squeeze(-1) + .to(torch.float32) + ) + else: + lse_correction = ( + torch.matmul(q, km_broadcast.transpose(2, 3)) + .squeeze(-1) + .to(torch.float32) + ) + else: + km = None + + if qk_quant_gran == "per_warp": + q_int8, q_scale, k_int8, k_scale = per_warp_int8_cuda( + q, + k, + km, + tensor_layout=tensor_layout, + BLKQ=128, + WARPQ=(16 if (q.size(-1) == 128 and pv_accum_dtype == "fp16+fp32") else 32), + BLKK=64, + ) + elif qk_quant_gran == "per_thread": + q_int8, q_scale, k_int8, k_scale = per_thread_int8_triton( + q, + k, + km, + tensor_layout=tensor_layout, + BLKQ=128, + WARPQ=(16 if (q.size(-1) == 128 and pv_accum_dtype == "fp16+fp32") else 32), + BLKK=64, + WARPK=64, + ) + + o = torch.empty(q.size(), dtype=dtype, device=q.device) + + if pv_accum_dtype in ["fp32", "fp16+fp32"] and smooth_v: + warnings.warn(f"pv_accum_dtype is {pv_accum_dtype}, smooth_v will be ignored.") + smooth_v = False + + if pv_accum_dtype == "fp32": + v = v.to(torch.float16) + lse = _qattn_sm80.qk_int8_sv_f16_accum_f32_attn( + q_int8, + k_int8, + v, + o, + q_scale, + k_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + elif pv_accum_dtype == "fp16": + if smooth_v: + smoothed_v, vm = sub_mean(v, tensor_layout=tensor_layout) + lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_fuse_v_mean_attn( + q_int8, + k_int8, + smoothed_v, + o, + q_scale, + k_scale, + vm, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + else: + v = v.to(torch.float16) + lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_attn( + q_int8, + k_int8, + v, + o, + q_scale, + k_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + elif pv_accum_dtype == "fp16+fp32": + v = v.to(torch.float16) + lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_attn_inst_buf( + q_int8, + k_int8, + v, + o, + q_scale, + k_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + else: + raise ValueError(f"Unsupported pv_accum_dtype: {pv_accum_dtype}") + + o = o[..., :head_dim_og] + + if return_lse: + return ( + o, + lse / 1.44269504 + lse_correction * sm_scale + if smooth_k + else lse / 1.44269504, + ) + else: + return o + + +@torch.compiler.disable +def sageattn_qk_int8_pv_fp8_cuda( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + tensor_layout: str = "HND", + is_causal: bool = False, + qk_quant_gran: str = "per_thread", + sm_scale: Optional[float] = None, + pv_accum_dtype: str = "fp32+fp16", + smooth_k: bool = True, + smooth_v: bool = False, + return_lse: bool = False, + **kwargs: Any, +) -> torch.Tensor: + """ + SageAttention with INT8 quantization for Q and K, FP8 PV with FP32 accumulation, implemented using CUDA. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. 
+ + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + v : torch.Tensor + The value tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + is_causal : bool + Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len. + Default: False. + + qk_quant_gran : str + The granularity of quantization for Q and K, either "per_warp" or "per_thread". + Default: "per_thread". + + sm_scale : Optional[float] + The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``. + + pv_accum_dtype : str + The dtype of the accumulation of the product of the value tensor and the attention weights, either "fp32" or "fp32+fp32". + - "fp32": PV accumulation is done in fully in FP32. However, due to the hardware issue, there are only 22 valid bits in the FP32 accumulator. + - "fp32+fp32": PV accumulation is done in FP32 (actually FP22), but added to a FP32 buffer every few iterations. This offers a balance between speed and accuracy. + Default: "fp32+fp32". + + smooth_k : bool + Whether to smooth the key tensor by subtracting the mean along the sequence dimension. + Default: True. + + smooth_v : bool + Whether to smooth the value tensor by subtracting the mean along the sequence dimension. + smooth_v will be ignored if pv_accum_dtype is "fp32+fp32". + Default: False. + + return_lse : bool + Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention. + Default: False. + + Returns + ------- + torch.Tensor + The output tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + torch.Tensor + The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor). + Shape: ``[batch_size, num_qo_heads, qo_len]``. + Only returned if `return_lse` is True. + + Note + ---- + - ``num_qo_heads`` must be divisible by ``num_kv_heads``. + - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - All tensors must be on the same cuda device. + - `smooth_k` will introduce slight overhead but will improve the accuracy under most circumstances. + """ + + dtype = q.dtype + assert q.is_cuda, "Input tensors must be on cuda." + assert dtype in [torch.float16, torch.bfloat16], ( + "Input tensors must be in dtype of torch.float16 or torch.bfloat16" + ) + assert qk_quant_gran in ["per_warp", "per_thread"], ( + "qk_quant_gran must be either 'per_warp' or 'per_thread'." + ) + assert q.device == k.device == v.device, "All tensors must be on the same device." + assert q.dtype == k.dtype == v.dtype, "All tensors must have the same dtype." + + # cuda_major_version, cuda_minor_version = get_cuda_version() + # if(cuda_major_version, cuda_minor_version) < (12, 8) and pv_accum_dtype == 'fp32+fp16': + # warnings.warn("cuda version < 12.8, change pv_accum_dtype to 'fp32+fp32'") + # pv_accum_dtype = 'fp32+fp32' + + # FIXME(DefTruth): make sage attention work compatible with distributed + # env, for example, xDiT which launch by torchrun. 
Without this workaround, + # sage attention will run into illegal memory access error after first + # inference step in distributed env for multi gpus inference. This small + # workaround also make sage attention work compatible with torch.compile + # through non-fullgraph compile mode. + torch.cuda.set_device(v.device) + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + _is_caual = 1 if is_causal else 0 + _qk_quant_gran = 3 if qk_quant_gran == "per_thread" else 2 + _return_lse = 1 if return_lse else 0 + + head_dim_og = q.size(-1) + + if head_dim_og < 64: + q = torch.nn.functional.pad(q, (0, 64 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 64 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 64 - head_dim_og)) + elif head_dim_og > 64 and head_dim_og < 128: + q = torch.nn.functional.pad(q, (0, 128 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 128 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 128 - head_dim_og)) + elif head_dim_og > 128: + raise ValueError(f"Unsupported head_dim: {head_dim_og}") + + # assert last dim is contiguous + assert q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1, ( + "Last dim of qkv must be contiguous." + ) + + if sm_scale is None: + sm_scale = head_dim_og**-0.5 + + seq_dim = 1 if _tensor_layout == 0 else 2 + nh_dim = 2 if _tensor_layout == 0 else 1 + + if smooth_k: + km = k.mean(dim=seq_dim, keepdim=True) + nqheads = q.size(2) + nkheads = k.size(2) + q_per_kv_heads = nqheads // nkheads + if q_per_kv_heads > 1: + # nheads_k => nheads_q + km_broadcast = torch.repeat_interleave(km, q_per_kv_heads, dim=nh_dim) + else: + km_broadcast = km + if return_lse: + if tensor_layout == "NHD": + lse_correction = ( + torch.matmul( + q.transpose(1, 2), km_broadcast.transpose(1, 2).transpose(2, 3) + ) + .squeeze(-1) + .to(torch.float32) + ) + else: + lse_correction = ( + torch.matmul(q, km_broadcast.transpose(2, 3)) + .squeeze(-1) + .to(torch.float32) + ) + else: + km = None + + if qk_quant_gran == "per_warp": + q_int8, q_scale, k_int8, k_scale = per_warp_int8_cuda( + q, k, km, tensor_layout=tensor_layout, BLKQ=128, WARPQ=32, BLKK=64 + ) + elif qk_quant_gran == "per_thread": + q_int8, q_scale, k_int8, k_scale = per_thread_int8_triton( + q, k, km, tensor_layout=tensor_layout, BLKQ=128, WARPQ=32, BLKK=64, WARPK=64 + ) + + o = torch.empty(q.size(), dtype=dtype, device=q.device) + + if pv_accum_dtype == "fp32+fp32" and smooth_v: + warnings.warn("pv_accum_dtype is 'fp32+fp32', smooth_v will be ignored.") + smooth_v = False + + if pv_accum_dtype == "fp32+fp16" and smooth_v: + warnings.warn("pv_accum_dtype is 'fp32+fp16', smooth_v will be ignored.") + smooth_v = False + + quant_v_scale_max = 448.0 + if pv_accum_dtype == "fp32+fp16": + quant_v_scale_max = 2.25 + + v_fp8, v_scale, vm = per_channel_fp8( + v, tensor_layout=tensor_layout, scale_max=quant_v_scale_max, smooth_v=smooth_v + ) + print("before kernel call") + if pv_accum_dtype == "fp32": + if smooth_v: + lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_fuse_v_mean_attn( + q_int8, + k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + vm, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + torch.cuda.synchronize() + else: + lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn( + q_int8, + k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + torch.cuda.synchronize() + elif pv_accum_dtype == "fp32+fp32": + lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf( + q_int8, + 
k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + torch.cuda.synchronize() + elif pv_accum_dtype == "fp32+fp16": + lse = ops.qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf( + q_int8, + k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + torch.cuda.synchronize() + o = o[..., :head_dim_og] + print("after kernel call") + if return_lse: + return ( + o, + lse / 1.44269504 + lse_correction * sm_scale + if smooth_k + else lse / 1.44269504, + ) + else: + return o + + +@torch.compiler.disable +def sageattn_qk_int8_pv_fp8_cuda_sm90( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + tensor_layout: str = "HND", + is_causal: bool = False, + qk_quant_gran: str = "per_thread", + sm_scale: Optional[float] = None, + pv_accum_dtype: str = "fp32+fp32", + smooth_k: bool = True, + return_lse: bool = False, + **kwargs: Any, +) -> torch.Tensor: + """ + SageAttention with INT8 quantization for Q and K, FP8 PV with FP32 accumulation, implemented using CUDA. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + v : torch.Tensor + The value tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + is_causal : bool + Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len. + Default: False. + + qk_quant_gran : str + The granularity of quantization for Q and K, either "per_warp" or "per_thread". + Default: "per_thread". + + sm_scale : Optional[float] + The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``. + + pv_accum_dtype : str + The dtype of the accumulation of the product of the value tensor and the attention weights, either "fp32" or "fp32+fp32". + - "fp32": PV accumulation is done in fully in FP32. However, due to the hardware issue, there are only 22 valid bits in the FP32 accumulator. + - "fp32+fp32": PV accumulation is done in FP32 (actually FP22), but added to a FP32 buffer every few iterations. This offers a balance between speed and accuracy. + Default: "fp32+fp32". + + smooth_k : bool + Whether to smooth the key tensor by subtracting the mean along the sequence dimension. + Default: True. + + return_lse : bool + Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention. + Default: False. + + Returns + ------- + torch.Tensor + The output tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + torch.Tensor + The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor). + Shape: ``[batch_size, num_qo_heads, qo_len]``. + Only returned if `return_lse` is True. + + Note + ---- + - ``num_qo_heads`` must be divisible by ``num_kv_heads``. 
+ - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - All tensors must be on the same cuda device. + - `smooth_k` will introduce slight overhead but will improve the accuracy under most circumstances. + """ + + dtype = q.dtype + assert q.is_cuda, "Input tensors must be on cuda." + assert dtype in [torch.float16, torch.bfloat16], ( + "Input tensors must be in dtype of torch.float16 or torch.bfloat16" + ) + assert qk_quant_gran in ["per_warp", "per_thread"], ( + "qk_quant_gran must be either 'per_warp' or 'per_thread'." + ) + assert q.device == k.device == v.device, "All tensors must be on the same device." + assert q.dtype == k.dtype == v.dtype, "All tensors must have the same dtype." + + torch.cuda.set_device(v.device) + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + _is_caual = 1 if is_causal else 0 + _qk_quant_gran = 3 if qk_quant_gran == "per_thread" else 2 + _return_lse = 1 if return_lse else 0 + + head_dim_og = q.size(-1) + + if head_dim_og < 64: + q = torch.nn.functional.pad(q, (0, 64 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 64 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 64 - head_dim_og)) + elif head_dim_og > 64 and head_dim_og < 128: + q = torch.nn.functional.pad(q, (0, 128 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 128 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 128 - head_dim_og)) + elif head_dim_og > 128: + raise ValueError(f"Unsupported head_dim: {head_dim_og}") + + # assert last dim is contiguous + assert q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1, ( + "Last dim of qkv must be contiguous." + ) + + if sm_scale is None: + sm_scale = head_dim_og**-0.5 + + seq_dim = 1 if _tensor_layout == 0 else 2 + nh_dim = 2 if _tensor_layout == 0 else 1 + + if smooth_k: + km = k.mean(dim=seq_dim, keepdim=True) + nqheads = q.size(2) + nkheads = k.size(2) + q_per_kv_heads = nqheads // nkheads + if q_per_kv_heads > 1: + # nheads_k => nheads_q + km_broadcast = torch.repeat_interleave(km, q_per_kv_heads, dim=nh_dim) + else: + km_broadcast = km + if return_lse: + if tensor_layout == "NHD": + lse_correction = ( + torch.matmul( + q.transpose(1, 2), km_broadcast.transpose(1, 2).transpose(2, 3) + ) + .squeeze(-1) + .to(torch.float32) + ) + else: + lse_correction = ( + torch.matmul(q, km_broadcast.transpose(2, 3)) + .squeeze(-1) + .to(torch.float32) + ) + else: + km = None + + if qk_quant_gran == "per_warp": + q_int8, q_scale, k_int8, k_scale = per_warp_int8_cuda( + q, k, km, tensor_layout=tensor_layout, BLKQ=64, WARPQ=16, BLKK=128 + ) + elif qk_quant_gran == "per_thread": + q_int8, q_scale, k_int8, k_scale = per_thread_int8_triton( + q, + k, + km, + tensor_layout=tensor_layout, + BLKQ=64, + WARPQ=16, + BLKK=128, + WARPK=128, + ) + + o = torch.empty(q.size(), dtype=dtype, device=q.device) + + # pad v to multiple of 128 + # TODO: modify per_channel_fp8 kernel to handle this + kv_len = k.size(seq_dim) + v_pad_len = 128 - (kv_len % 128) if kv_len % 128 != 0 else 0 + if v_pad_len > 0: + if tensor_layout == "HND": + v = torch.cat( + [ + v, + torch.zeros( + v.size(0), + v.size(1), + v_pad_len, + v.size(3), + dtype=v.dtype, + device=v.device, + ), + ], + dim=2, + ) + else: + v = torch.cat( + [ + v, + torch.zeros( + v.size(0), + v_pad_len, + v.size(2), + v.size(3), + dtype=v.dtype, + device=v.device, + ), + ], + dim=1, + ) + + v_fp8, v_scale, _ = per_channel_fp8(v, tensor_layout=tensor_layout, smooth_v=False) + + if pv_accum_dtype == "fp32": + raise NotImplementedError("Please use 
pv_accum_dtype='fp32+fp32' for sm90.") + lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn( + q_int8, + k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + elif pv_accum_dtype == "fp32+fp32": + print( + "qint8", + q_int8.shape, + "qscale", + q_scale.shape, + "kint8", + k_int8.shape, + "kscale", + k_scale.shape, + "vfp8", + v_fp8.shape, + "vscale", + v_scale.shape, + ) + lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf_sm90( + q_int8, + k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + + o = o[..., :head_dim_og] + + if return_lse: + return ( + o, + lse / 1.44269504 + lse_correction * sm_scale + if smooth_k + else lse / 1.44269504, + ) + else: + return o diff --git a/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/layers.py b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/quant.py b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/quant.py new file mode 100644 index 0000000000000000000000000000000000000000..2e7a32c6a1e502bee12d0e0564ff2b90f6b00462 --- /dev/null +++ b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/quant.py @@ -0,0 +1,326 @@ +""" +Copyright (c) 2024 by SageAttention team. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import torch +from typing import Optional + +from ._ops import ops + + +def per_block_int8( + q: torch.Tensor, + k: torch.Tensor, + km: Optional[torch.Tensor] = None, + BLKQ: int = 128, + BLKK: int = 64, + sm_scale: Optional[float] = None, + tensor_layout: str = "HND", +): + """ + Quantize the query tensor `q` and the key tensor `k` with per block quantization. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + km : Optional[torch.Tensor] + The mean tensor of `k` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]``. + Should be of the same dtype as `k` if provided. Default is None. + + sm_scale : Optional[float] + The scale factor for the softmax operation. Default is ``head_dim**-0.5``. + It will be multiplied by ``1.44269504`` to work together with the triton attention kernel. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + Returns + ------- + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor] + A tuple containing: + - The quantized query tensor. Shape: Same as `q` but with `int8` dtype. 
+ - The scale tensor of the query tensor. Shape: ``[batch_size, num_qo_heads, (qo_len + BLKQ - 1) // BLKQ]`` with `float32` dtype. + - The quantized key tensor. Shape: Same as `k` but with `int8` dtype. + - The scale tensor of the key tensor. Shape: ``[batch_size, num_kv_heads, (kv_len + BLKK - 1) // BLKK]`` with `float32` dtype. + + Note + ---- + - The tensors `q` and `k` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + """ + + q_int8 = torch.empty(q.shape, dtype=torch.int8, device=q.device) + k_int8 = torch.empty(k.shape, dtype=torch.int8, device=k.device) + + if tensor_layout == "HND": + b, h_qo, qo_len, head_dim = q.shape + _, h_kv, kv_len, _ = k.shape + + elif tensor_layout == "NHD": + b, qo_len, h_qo, head_dim = q.shape + _, kv_len, h_kv, _ = k.shape + + else: + raise ValueError(f"Unknown tensor layout: {tensor_layout}") + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + + q_scale = torch.empty( + (b, h_qo, (qo_len + BLKQ - 1) // BLKQ), device=q.device, dtype=torch.float32 + ) + k_scale = torch.empty( + (b, h_kv, (kv_len + BLKK - 1) // BLKK), device=q.device, dtype=torch.float32 + ) + + if sm_scale is None: + sm_scale = head_dim**-0.5 + + sm_scale *= 1.44269504 + + ops.quant_per_block_int8_cuda(q, q_int8, q_scale, sm_scale, BLKQ, _tensor_layout) + if km is not None: + km = km.squeeze(1) if _tensor_layout == 0 else km.squeeze(2) + ops.quant_per_block_int8_fuse_sub_mean_cuda( + k, km, k_int8, k_scale, BLKK, _tensor_layout + ) + else: + # The bound CUDA op expects an sm_scale argument; use 1.0 for K to avoid scaling + ops.quant_per_block_int8_cuda(k, k_int8, k_scale, 1.0, BLKK, _tensor_layout) + + return q_int8, q_scale, k_int8, k_scale + + +def per_warp_int8( + q: torch.Tensor, + k: torch.Tensor, + km: Optional[torch.Tensor] = None, + BLKQ: int = 128, + WARPQ: int = 32, + BLKK: int = 64, + tensor_layout: str = "HND", +): + """ + Quantize the query tensor `q` with per warp quantization and the key tensor `k` with per block quantization. + Warp size of quantizing `q` is 16 or 32, with a block size of 64 or 128. + Block size of quantizing `k` is 64 or 128. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + km : Optional[torch.Tensor] + The mean tensor of `k` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]``. + Should be of the same dtype as `k` if provided. Default is None. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + Returns + ------- + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor] + A tuple containing: + - The quantized query tensor. Shape: Same as `q` but with `int8` dtype. + - The scale tensor of the query tensor. Shape: ``[batch_size, num_qo_heads, (qo_len + BLKQ - 1) // BLKQ * (BLKQ // WARPQ)]`` with `float32` dtype. + - The quantized key tensor. Shape: Same as `k` but with `int8` dtype. + - The scale tensor of the key tensor. Shape: ``[batch_size, num_kv_heads, (kv_len + BLKK - 1) // BLKK]`` with `float32` dtype. 
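+    Example
+    -------
+    A minimal sketch (shapes are illustrative; it assumes fp16 inputs on a CUDA
+    device with the bundled quantization kernels available):
+
+    >>> q = torch.randn(1, 8, 1024, 64, dtype=torch.float16, device="cuda")
+    >>> k = torch.randn(1, 8, 1024, 64, dtype=torch.float16, device="cuda")
+    >>> q_int8, q_scale, k_int8, k_scale = per_warp_int8(q, k, BLKQ=128, WARPQ=32, BLKK=64)
+    >>> q_scale.shape  # (b, h_qo, (qo_len + BLKQ - 1) // BLKQ * (BLKQ // WARPQ))
+    torch.Size([1, 8, 32])
+    >>> k_scale.shape  # (b, h_kv, (kv_len + BLKK - 1) // BLKK)
+    torch.Size([1, 8, 16])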
+ + Note + ---- + - The tensors `q` and `k` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + """ + + q_int8 = torch.empty(q.shape, dtype=torch.int8, device=q.device) + k_int8 = torch.empty(k.shape, dtype=torch.int8, device=k.device) + + if tensor_layout == "HND": + b, h_qo, qo_len, head_dim = q.shape + _, h_kv, kv_len, _ = k.shape + + elif tensor_layout == "NHD": + b, qo_len, h_qo, head_dim = q.shape + _, kv_len, h_kv, _ = k.shape + + else: + raise ValueError(f"Unknown tensor layout: {tensor_layout}") + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + + q_scale = torch.empty( + (b, h_qo, ((qo_len + BLKQ - 1) // BLKQ) * (BLKQ // WARPQ)), + device=q.device, + dtype=torch.float32, + ) + k_scale = torch.empty( + (b, h_kv, (kv_len + BLKK - 1) // BLKK), device=q.device, dtype=torch.float32 + ) + + ops.quant_per_warp_int8_cuda(q, q_int8, q_scale, BLKQ, WARPQ, _tensor_layout) + + if km is not None: + km = km.squeeze(1) if _tensor_layout == 0 else km.squeeze(2) + ops.quant_per_block_int8_fuse_sub_mean_cuda( + k, km, k_int8, k_scale, BLKK, _tensor_layout + ) + else: + # The bound CUDA op expects an sm_scale argument; use 1.0 for K to avoid scaling + ops.quant_per_block_int8_cuda(k, k_int8, k_scale, 1.0, BLKK, _tensor_layout) + + return q_int8, q_scale, k_int8, k_scale + + +def sub_mean(v: torch.Tensor, tensor_layout: str = "HND"): + """ + Calculate the mean of the tensor `v` along the sequence length dimension and subtract it from `v`. Result is stored as fp16. + + Parameters + ---------- + v : torch.Tensor + The input tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + Returns + ------- + Tuple[torch.Tensor, torch.Tensor] + A tuple containing: + - The tensor `v_smoothed` with the mean subtracted and stored as fp16. Shape: Same as `v` with `float16` dtype. + - The mean tensor of `v` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]`` with dtype same as `v`. + + Note + ---- + - The tensors `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - The returned tensor `v_smoothed` will have dtype ``torch.float16`` regardless of the input dtype. + - The returned mean tensor will have the same dtype as the input tensor. + """ + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + vm = v.mean(dim=1 if _tensor_layout == 0 else 2) + + v_smoothed = torch.empty(v.shape, dtype=torch.float16, device=v.device) + + # subtract mean and store the result as fp16 + ops.sub_mean_cuda(v, vm, v_smoothed, _tensor_layout) + + return v_smoothed, vm + + +def per_channel_fp8( + v: torch.Tensor, + tensor_layout: str = "HND", + scale_max: float = 448.0, + smooth_v: bool = True, +): + """ + Transpose, pad and permute the tensor `v` and quantize it to fp8 with per channel quantization. + `v` is first transposed along the head dimension and the sequence length dimension, then padded to a multiple of 64. + After that, the tensor is permuted along the sequence length dimension by ``[0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15]``. + The quantization is done per channel, with the scale value and smooth factor calculated per channel. + + Parameters + ---------- + v : torch.Tensor + The input tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. 
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + scale_max : float + The maximum scale value for the quantization. Default is 448.0 (upper bound of E4M3 data format). + + smooth_v : bool + Whether to smooth the quantized tensor. Default is True. + + Returns + ------- + Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] + A tuple containing: + - The quantized tensor `v_fp8`. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, head_dim, (kv_len + 63) // 64 * 64]``, with `float8_e4m3fn` dtype. + - If `tensor_layout` is "NHD": ``[batch_size, head_dim, num_kv_heads, (kv_len + 63) // 64 * 64]``, with `float8_e4m3fn` dtype. + - The scale tensor of `v`. Shape: ``[batch_size, num_kv_heads, head_dim]`` with `float32` dtype. + - The mean tensor of `v` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]`` with `float32` dtype. + + Note + ---- + - The tensors `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - The returned mean tensor will be None if `smooth_v` is False. Otherwise it will have dtype ``torch.float32``. + """ + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + + if tensor_layout == "HND": + b, h_kv, kv_len, head_dim = v.shape + padded_len = (kv_len + 63) // 64 * 64 + v_transposed_permutted = torch.empty( + (b, h_kv, head_dim, padded_len), dtype=v.dtype, device=v.device + ) + + elif tensor_layout == "NHD": + b, kv_len, h_kv, head_dim = v.shape + padded_len = (kv_len + 63) // 64 * 64 + v_transposed_permutted = torch.empty( + (b, head_dim, h_kv, padded_len), dtype=v.dtype, device=v.device + ) + + ops.transpose_pad_permute_cuda(v, v_transposed_permutted, _tensor_layout) + + v_fp8 = torch.empty( + v_transposed_permutted.shape, dtype=torch.float8_e4m3fn, device=v.device + ) + + v_scale = torch.empty((b, h_kv, head_dim), dtype=torch.float32, device=v.device) + vm = torch.empty((b, h_kv, head_dim), dtype=torch.float32, device=v.device) + + if smooth_v: + ops.mean_scale_fuse_quant_cuda( + v_transposed_permutted, + v_fp8, + vm, + v_scale, + kv_len, + scale_max, + _tensor_layout, + ) + return v_fp8, v_scale, vm + else: + ops.scale_fuse_quant_cuda( + v_transposed_permutted, v_fp8, v_scale, kv_len, scale_max, _tensor_layout + ) + return v_fp8, v_scale, None diff --git a/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/quant_per_thread.py b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/quant_per_thread.py new file mode 100644 index 0000000000000000000000000000000000000000..ab81f57c3e6dd9df89946ba54c4e2c3844c94d34 --- /dev/null +++ b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/quant_per_thread.py @@ -0,0 +1,204 @@ +""" +Copyright (c) 2024 by SageAttention team. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import torch +import triton +import triton.language as tl + +@triton.jit +def quant_query_per_thread_int8_kernel(Input, Output, Scale, L, + stride_iz, stride_ih, stride_in, + stride_oz, stride_oh, stride_on, + stride_sz, stride_sh, + C: tl.constexpr, BLK: tl.constexpr): + off_blk = tl.program_id(0) // 8 + off_tld = tl.program_id(0) % 8 + off_h = tl.program_id(1) + off_b = tl.program_id(2) + + offs_n = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld + offs_k = tl.arange(0, C) + + input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :] + output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :] + scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 8 + off_tld + + x = tl.load(input_ptrs, mask=offs_n[:, None] < L) + x = x.to(tl.float32) + scale = tl.max(tl.abs(x)) / 127. + 0.0000001 + x_int8 = x / scale + x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1) + x_int8 = x_int8.to(tl.int8) + tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L) + tl.store(scale_ptrs, scale) + +@triton.jit +def quant_key_per_thread_int8_kernel(Input, Output, Scale, L, + stride_iz, stride_ih, stride_in, + stride_oz, stride_oh, stride_on, + stride_sz, stride_sh, + C: tl.constexpr, BLK: tl.constexpr): + off_blk = tl.program_id(0) // 4 + off_tld = tl.program_id(0) % 4 + off_h = tl.program_id(1) + off_b = tl.program_id(2) + + # offs_n = off_blk * BLK + tl.cat(tl.arange(0, BLK // 8) * 8, tl.arange(0, BLK // 8) * 8 + 1, True) + off_tld * 2 + # offs_k = tl.arange(0, C) + + # input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :] + # output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :] + # scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 4 + off_tld + + # x = tl.load(input_ptrs, mask=offs_n[:, None] < L) + # x = x.to(tl.float32) + # scale = tl.max(tl.abs(x)) / 127. + 0.0000001 + # x_int8 = x / scale + # x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1) + # x_int8 = x_int8.to(tl.int8) + # tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L) + # tl.store(scale_ptrs, scale) + + offs_n0 = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld * 2 + offs_n1 = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld * 2 + 1 + offs_k = tl.arange(0, C) + + input_ptrs0 = Input + off_b * stride_iz + off_h * stride_ih + offs_n0[:, None] * stride_in + offs_k[None, :] + input_ptrs1 = Input + off_b * stride_iz + off_h * stride_ih + offs_n1[:, None] * stride_in + offs_k[None, :] + output_ptrs0 = Output + off_b * stride_oz + off_h * stride_oh + offs_n0[:, None] * stride_on + offs_k[None, :] + output_ptrs1 = Output + off_b * stride_oz + off_h * stride_oh + offs_n1[:, None] * stride_on + offs_k[None, :] + scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 4 + off_tld + + x0 = tl.load(input_ptrs0, mask=offs_n0[:, None] < L) + x1 = tl.load(input_ptrs1, mask=offs_n1[:, None] < L) + x0 = x0.to(tl.float32) + x1 = x1.to(tl.float32) + scale = max(tl.max(tl.abs(x0)), tl.max(tl.abs(x1))) / 127. 
+ 0.0000001 + x0_int8 = x0 / scale + x1_int8 = x1 / scale + x0_int8 += 0.5 * tl.where(x0_int8 >= 0, 1, -1) + x1_int8 += 0.5 * tl.where(x1_int8 >= 0, 1, -1) + x0_int8 = x0_int8.to(tl.int8) + x1_int8 = x1_int8.to(tl.int8) + tl.store(output_ptrs0, x0_int8, mask=offs_n0[:, None] < L) + tl.store(output_ptrs1, x1_int8, mask=offs_n1[:, None] < L) + tl.store(scale_ptrs, scale) + +@triton.jit +def quant_query_per_thread_int4_kernel(Input, Output, Scale, L, + stride_iz, stride_ih, stride_in, + stride_oz, stride_oh, stride_on, + stride_sz, stride_sh, + C: tl.constexpr, BLK: tl.constexpr): + off_blk = tl.program_id(0) // 8 + off_tld = tl.program_id(0) % 8 + off_h = tl.program_id(1) + off_b = tl.program_id(2) + + offs_n = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld + offs_k = tl.arange(0, C) + + input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :] + output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :] + scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 8 + off_tld + + x = tl.load(input_ptrs, mask=offs_n[:, None] < L) + x = x.to(tl.float32) + scale = tl.max(tl.abs(x)) / 7. + 0.0000001 + x_int8 = x / scale + x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1) + x_int8 = x_int8.to(tl.int8) + tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L) + tl.store(scale_ptrs, scale) + +@triton.jit +def quant_key_per_thread_int4_kernel(Input, Output, Scale, L, + stride_iz, stride_ih, stride_in, + stride_oz, stride_oh, stride_on, + stride_sz, stride_sh, + C: tl.constexpr, BLK: tl.constexpr): + off_blk = tl.program_id(0) // 4 + off_tld = tl.program_id(0) % 4 + off_h = tl.program_id(1) + off_b = tl.program_id(2) + + offs_n = off_blk * BLK + tl.cat(tl.arange(0, BLK // 8) * 8, tl.arange(0, BLK // 8) * 8 + 1, True) + off_tld * 2 + offs_k = tl.arange(0, C) + + input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :] + output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :] + scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 4 + off_tld + + x = tl.load(input_ptrs, mask=offs_n[:, None] < L) + x = x.to(tl.float32) + scale = tl.max(tl.abs(x)) / 7. 
+ 0.0000001 + x_int8 = x / scale + x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1) + x_int8 = x_int8.to(tl.int8) + tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L) + tl.store(scale_ptrs, scale) + +def per_thread_int8(q, k, km=None, BLKQ=128, WARPQ=32, BLKK=64, WARPK=64, sm_scale=None, tensor_layout="HND"): + q_int8 = torch.empty(q.shape, dtype=torch.int8, device=q.device) + k_int8 = torch.empty(k.shape, dtype=torch.int8, device=k.device) + + if km is not None: + k = k - km + + if tensor_layout == "HND": + b, h_qo, qo_len, head_dim = q.shape + _, h_kv, kv_len, _ = k.shape + + stride_bz_q, stride_h_q, stride_seq_q = q.stride(0), q.stride(1), q.stride(2) + stride_bz_qo, stride_h_qo, stride_seq_qo = q_int8.stride(0), q_int8.stride(1), q_int8.stride(2) + stride_bz_k, stride_h_k, stride_seq_k = k.stride(0), k.stride(1), k.stride(2) + stride_bz_ko, stride_h_ko, stride_seq_ko = k_int8.stride(0), k_int8.stride(1), k_int8.stride(2) + elif tensor_layout == "NHD": + b, qo_len, h_qo, head_dim = q.shape + _, kv_len, h_kv, _ = k.shape + + stride_bz_q, stride_h_q, stride_seq_q = q.stride(0), q.stride(2), q.stride(1) + stride_bz_qo, stride_h_qo, stride_seq_qo = q_int8.stride(0), q_int8.stride(2), q_int8.stride(1) + stride_bz_k, stride_h_k, stride_seq_k = k.stride(0), k.stride(2), k.stride(1) + stride_bz_ko, stride_h_ko, stride_seq_ko = k_int8.stride(0), k_int8.stride(2), k_int8.stride(1) + else: + raise ValueError(f"Unknown tensor layout: {tensor_layout}") + + q_scale = torch.empty((b, h_qo, (qo_len + BLKQ - 1) // BLKQ * (BLKQ // WARPQ) * 8), device=q.device, dtype=torch.float32) + k_scale = torch.empty((b, h_kv, (kv_len + BLKK - 1) // BLKK * (BLKK // WARPK) * 4), device=q.device, dtype=torch.float32) + + if sm_scale is None: + sm_scale = head_dim**-0.5 + + grid = ((qo_len + BLKQ - 1) // BLKQ * (BLKQ // WARPQ) * 8, h_qo, b) + quant_query_per_thread_int8_kernel[grid]( + q, q_int8, q_scale, qo_len, + stride_bz_q, stride_h_q, stride_seq_q, + stride_bz_qo, stride_h_qo, stride_seq_qo, + q_scale.stride(0), q_scale.stride(1), + C=head_dim, BLK=WARPQ + ) + + grid = ((kv_len + BLKK - 1) // BLKK * (BLKK // WARPK) * 4, h_kv, b) + quant_key_per_thread_int8_kernel[grid]( + k, k_int8, k_scale, kv_len, + stride_bz_k, stride_h_k, stride_seq_k, + stride_bz_ko, stride_h_ko, stride_seq_ko, + k_scale.stride(0), k_scale.stride(1), + C=head_dim, BLK=WARPK + ) + + return q_int8, q_scale, k_int8, k_scale \ No newline at end of file diff --git a/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__init__.py b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5df69e92664edfbd0820d5126792e41e23a72762 --- /dev/null +++ b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__init__.py @@ -0,0 +1,12 @@ +from .quant import per_block_int8, per_warp_int8, sub_mean, per_channel_fp8 +from .core import sageattn, sageattn_qk_int8_pv_fp8_cuda + + +__all__ = [ + "per_block_int8", + "per_warp_int8", + "sub_mean", + "per_channel_fp8", + "sageattn", + "sageattn_qk_int8_pv_fp8_cuda", +] \ No newline at end of file diff --git a/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1dfb4a20911742e46041373d8bd10f01e6a83afa Binary files /dev/null and b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc differ diff --git 
a/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9d6048f0206f7d3fe5f01106837b0145a66b1df7 Binary files /dev/null and b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..674578609f75fb9d129c302f5e2efd5e9dd88869 Binary files /dev/null and b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2b1288e6a808145a21813915428772c5151fffe9 Binary files /dev/null and b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7d7155cabca1497530d3d266f4ec3b647f995f95 Binary files /dev/null and b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/_ops.py b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..7dc56aab172e24e052a9f0c78060ecfbdff00309 --- /dev/null +++ b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _sage_attention_44b112f_dirty +ops = torch.ops._sage_attention_44b112f_dirty + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_sage_attention_44b112f_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/_sage_attention_44b112f_dirty.abi3.so b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/_sage_attention_44b112f_dirty.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..736613dcbbc913f7a1c538e9d383e65f98fe5f52 --- /dev/null +++ b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/_sage_attention_44b112f_dirty.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:824faeacc05dc7d676acaa9a005d5f4d7e62f47c361eb58a085f020e21fde29e +size 26612144 diff --git a/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/core.py b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/core.py new file mode 100644 index 0000000000000000000000000000000000000000..dc44a8e1ee17a5c5c65da5adda6faf9228cca55e --- /dev/null +++ b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/core.py @@ -0,0 +1,983 @@ +""" +Copyright (c) 2024 by SageAttention team. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import torch +import torch.nn.functional as F + +from ._ops import ops + + +from .quant import per_warp_int8 as per_warp_int8_cuda +from .quant import sub_mean +from .quant import per_channel_fp8 +from .quant_per_thread import per_thread_int8 as per_thread_int8_triton + +from typing import Any, List, Literal, Optional, Tuple, Union +import warnings + + +import subprocess +import re + + +def get_cuda_version(): + try: + output = subprocess.check_output(["nvcc", "--version"]).decode() + match = re.search(r"release (\d+)\.(\d+)", output) + if match: + major, minor = int(match.group(1)), int(match.group(2)) + return major, minor + except Exception as e: + print("Failed to get CUDA version:", e) + return None, None + + +def get_cuda_arch_versions(): + cuda_archs = [] + for i in range(torch.cuda.device_count()): + major, minor = torch.cuda.get_device_capability(i) + cuda_archs.append(f"sm{major}{minor}") + return cuda_archs + + +def sageattn( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + tensor_layout: str = "HND", + is_causal: bool = False, + sm_scale: Optional[float] = None, + return_lse: bool = False, + **kwargs: Any, +): + """ + Automatically selects the appropriate implementation of the SageAttention kernel based on the GPU compute capability. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + v : torch.Tensor + The value tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + is_causal : bool + Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len. + Default: False. + + sm_scale : Optional[float] + The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``. + + return_lse : bool + Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention. + Default: False. + + Returns + ------- + torch.Tensor + The output tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + torch.Tensor + The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor). + Shape: ``[batch_size, num_qo_heads, qo_len]``. + Only returned if `return_lse` is True. + + Note + ---- + - ``num_qo_heads`` must be divisible by ``num_kv_heads``. + - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - All tensors must be on the same cuda device. 
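+    Example
+    -------
+    A minimal sketch of the "NHD" layout together with the logsumexp output
+    (shapes are illustrative; it assumes fp16 inputs on a supported CUDA device):
+
+    >>> q = torch.randn(2, 4096, 16, 128, dtype=torch.float16, device="cuda")
+    >>> k = torch.randn(2, 4096, 16, 128, dtype=torch.float16, device="cuda")
+    >>> v = torch.randn(2, 4096, 16, 128, dtype=torch.float16, device="cuda")
+    >>> o, lse = sageattn(q, k, v, tensor_layout="NHD", return_lse=True)
+    >>> o.shape, lse.shape
+    (torch.Size([2, 4096, 16, 128]), torch.Size([2, 16, 4096]))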
+ """ + + arch = get_cuda_arch_versions()[q.device.index] + if arch == "sm80": + return sageattn_qk_int8_pv_fp16_cuda( + q, + k, + v, + tensor_layout=tensor_layout, + is_causal=is_causal, + sm_scale=sm_scale, + return_lse=return_lse, + pv_accum_dtype="fp32", + ) + elif arch == "sm89": + return sageattn_qk_int8_pv_fp8_cuda( + q, + k, + v, + tensor_layout=tensor_layout, + is_causal=is_causal, + sm_scale=sm_scale, + return_lse=return_lse, + pv_accum_dtype="fp32+fp16", + ) + elif arch == "sm90": + return sageattn_qk_int8_pv_fp8_cuda_sm90( + q, + k, + v, + tensor_layout=tensor_layout, + is_causal=is_causal, + sm_scale=sm_scale, + return_lse=return_lse, + pv_accum_dtype="fp32+fp32", + ) + elif arch == "sm120": + return sageattn_qk_int8_pv_fp8_cuda( + q, + k, + v, + tensor_layout=tensor_layout, + is_causal=is_causal, + qk_quant_gran="per_warp", + sm_scale=sm_scale, + return_lse=return_lse, + pv_accum_dtype="fp32+fp16", + ) # sm120 has accurate fp32 accumulator for fp8 mma and triton kernel is currently not usable on sm120. + else: + raise ValueError(f"Unsupported CUDA architecture: {arch}") + + +@torch.compiler.disable +def sageattn_qk_int8_pv_fp16_cuda( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + tensor_layout: str = "HND", + is_causal: bool = False, + qk_quant_gran: str = "per_thread", + sm_scale: Optional[float] = None, + pv_accum_dtype: str = "fp32", + smooth_k: bool = True, + smooth_v: bool = False, + return_lse: bool = False, + **kwargs: Any, +) -> torch.Tensor: + """ + SageAttention with INT8 quantization for Q and K, FP16 PV with FP16/FP32 accumulation, implemented using CUDA. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + v : torch.Tensor + The value tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + is_causal : bool + Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len. + Default: False. + + qk_quant_gran : str + The granularity of quantization for Q and K, either "per_warp" or "per_thread". + Default: "per_thread". + + sm_scale : Optional[float] + The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``. + + pv_accum_dtype : str + The dtype of the accumulation of the product of the value tensor and the attention weights, either "fp16", "fp16+fp32" or "fp32". + - "fp16": PV accumulation is done in fully in FP16. This is the fastest option but may lead to numerical instability. `smooth_v` option will increase the accuracy in cases when the value tensor has a large bias (like in CogVideoX-2b). + - "fp32": PV accumulation is done in FP32. This is the most accurate option but may be slower than "fp16" due to CUDA core overhead. + - "fp16+fp32": PV accumulation is done in FP16, but added to a FP32 buffer every few iterations. This offers a balance between speed and accuracy. + Default: "fp32". 
+ + smooth_k : bool + Whether to smooth the key tensor by subtracting the mean along the sequence dimension. + Default: True. + + smooth_v : bool + Whether to smooth the value tensor by subtracting the mean along the sequence dimension. + smooth_v will be ignored if pv_accum_dtype is "fp32" or "fp16+fp32". + Default: False. + + return_lse : bool + Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention. + Default: False. + + Returns + ------- + torch.Tensor + The output tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + torch.Tensor + The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor). + Shape: ``[batch_size, num_qo_heads, qo_len]``. + Only returned if `return_lse` is True. + + Note + ---- + - ``num_qo_heads`` must be divisible by ``num_kv_heads``. + - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - All tensors must be on the same cuda device. + - `smooth_k` will introduce slight overhead but will improve the accuracy under most circumstances. + """ + + dtype = q.dtype + assert q.is_cuda, "Input tensors must be on cuda." + assert dtype in [torch.float16, torch.bfloat16], ( + "Input tensors must be in dtype of torch.float16 or torch.bfloat16" + ) + assert qk_quant_gran in ["per_warp", "per_thread"], ( + "qk_quant_gran must be either 'per_warp' or 'per_thread'." + ) + assert q.device == k.device == v.device, "All tensors must be on the same device." + assert q.dtype == k.dtype == v.dtype, "All tensors must have the same dtype." + + # FIXME(DefTruth): make sage attention work compatible with distributed + # env, for example, xDiT which launch by torchrun. Without this workaround, + # sage attention will run into illegal memory access error after first + # inference step in distributed env for multi gpus inference. This small + # workaround also make sage attention work compatible with torch.compile + # through non-fullgraph compile mode. + torch.cuda.set_device(v.device) + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + _is_caual = 1 if is_causal else 0 + _qk_quant_gran = 3 if qk_quant_gran == "per_thread" else 2 + _return_lse = 1 if return_lse else 0 + + head_dim_og = q.size(-1) + + if head_dim_og < 64: + q = torch.nn.functional.pad(q, (0, 64 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 64 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 64 - head_dim_og)) + elif head_dim_og > 64 and head_dim_og < 128: + q = torch.nn.functional.pad(q, (0, 128 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 128 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 128 - head_dim_og)) + elif head_dim_og > 128: + raise ValueError(f"Unsupported head_dim: {head_dim_og}") + + # assert last dim is contiguous + assert q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1, ( + "Last dim of qkv must be contiguous." 
+ ) + + if sm_scale is None: + sm_scale = head_dim_og**-0.5 + + seq_dim = 1 if _tensor_layout == 0 else 2 + nh_dim = 2 if _tensor_layout == 0 else 1 + + if smooth_k: + km = k.mean(dim=seq_dim, keepdim=True) + nqheads = q.size(2) + nkheads = k.size(2) + q_per_kv_heads = nqheads // nkheads + if q_per_kv_heads > 1: + # nheads_k => nheads_q + km_broadcast = torch.repeat_interleave(km, q_per_kv_heads, dim=nh_dim) + else: + km_broadcast = km + if return_lse: + if tensor_layout == "NHD": + lse_correction = ( + torch.matmul( + q.transpose(1, 2), km_broadcast.transpose(1, 2).transpose(2, 3) + ) + .squeeze(-1) + .to(torch.float32) + ) + else: + lse_correction = ( + torch.matmul(q, km_broadcast.transpose(2, 3)) + .squeeze(-1) + .to(torch.float32) + ) + else: + km = None + + if qk_quant_gran == "per_warp": + q_int8, q_scale, k_int8, k_scale = per_warp_int8_cuda( + q, + k, + km, + tensor_layout=tensor_layout, + BLKQ=128, + WARPQ=(16 if (q.size(-1) == 128 and pv_accum_dtype == "fp16+fp32") else 32), + BLKK=64, + ) + elif qk_quant_gran == "per_thread": + q_int8, q_scale, k_int8, k_scale = per_thread_int8_triton( + q, + k, + km, + tensor_layout=tensor_layout, + BLKQ=128, + WARPQ=(16 if (q.size(-1) == 128 and pv_accum_dtype == "fp16+fp32") else 32), + BLKK=64, + WARPK=64, + ) + + o = torch.empty(q.size(), dtype=dtype, device=q.device) + + if pv_accum_dtype in ["fp32", "fp16+fp32"] and smooth_v: + warnings.warn(f"pv_accum_dtype is {pv_accum_dtype}, smooth_v will be ignored.") + smooth_v = False + + if pv_accum_dtype == "fp32": + v = v.to(torch.float16) + lse = _qattn_sm80.qk_int8_sv_f16_accum_f32_attn( + q_int8, + k_int8, + v, + o, + q_scale, + k_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + elif pv_accum_dtype == "fp16": + if smooth_v: + smoothed_v, vm = sub_mean(v, tensor_layout=tensor_layout) + lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_fuse_v_mean_attn( + q_int8, + k_int8, + smoothed_v, + o, + q_scale, + k_scale, + vm, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + else: + v = v.to(torch.float16) + lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_attn( + q_int8, + k_int8, + v, + o, + q_scale, + k_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + elif pv_accum_dtype == "fp16+fp32": + v = v.to(torch.float16) + lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_attn_inst_buf( + q_int8, + k_int8, + v, + o, + q_scale, + k_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + else: + raise ValueError(f"Unsupported pv_accum_dtype: {pv_accum_dtype}") + + o = o[..., :head_dim_og] + + if return_lse: + return ( + o, + lse / 1.44269504 + lse_correction * sm_scale + if smooth_k + else lse / 1.44269504, + ) + else: + return o + + +@torch.compiler.disable +def sageattn_qk_int8_pv_fp8_cuda( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + tensor_layout: str = "HND", + is_causal: bool = False, + qk_quant_gran: str = "per_thread", + sm_scale: Optional[float] = None, + pv_accum_dtype: str = "fp32+fp16", + smooth_k: bool = True, + smooth_v: bool = False, + return_lse: bool = False, + **kwargs: Any, +) -> torch.Tensor: + """ + SageAttention with INT8 quantization for Q and K, FP8 PV with FP32 accumulation, implemented using CUDA. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. 
+ + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + v : torch.Tensor + The value tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + is_causal : bool + Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len. + Default: False. + + qk_quant_gran : str + The granularity of quantization for Q and K, either "per_warp" or "per_thread". + Default: "per_thread". + + sm_scale : Optional[float] + The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``. + + pv_accum_dtype : str + The dtype of the accumulation of the product of the value tensor and the attention weights, either "fp32" or "fp32+fp32". + - "fp32": PV accumulation is done in fully in FP32. However, due to the hardware issue, there are only 22 valid bits in the FP32 accumulator. + - "fp32+fp32": PV accumulation is done in FP32 (actually FP22), but added to a FP32 buffer every few iterations. This offers a balance between speed and accuracy. + Default: "fp32+fp32". + + smooth_k : bool + Whether to smooth the key tensor by subtracting the mean along the sequence dimension. + Default: True. + + smooth_v : bool + Whether to smooth the value tensor by subtracting the mean along the sequence dimension. + smooth_v will be ignored if pv_accum_dtype is "fp32+fp32". + Default: False. + + return_lse : bool + Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention. + Default: False. + + Returns + ------- + torch.Tensor + The output tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + torch.Tensor + The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor). + Shape: ``[batch_size, num_qo_heads, qo_len]``. + Only returned if `return_lse` is True. + + Note + ---- + - ``num_qo_heads`` must be divisible by ``num_kv_heads``. + - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - All tensors must be on the same cuda device. + - `smooth_k` will introduce slight overhead but will improve the accuracy under most circumstances. + """ + + dtype = q.dtype + assert q.is_cuda, "Input tensors must be on cuda." + assert dtype in [torch.float16, torch.bfloat16], ( + "Input tensors must be in dtype of torch.float16 or torch.bfloat16" + ) + assert qk_quant_gran in ["per_warp", "per_thread"], ( + "qk_quant_gran must be either 'per_warp' or 'per_thread'." + ) + assert q.device == k.device == v.device, "All tensors must be on the same device." + assert q.dtype == k.dtype == v.dtype, "All tensors must have the same dtype." + + # cuda_major_version, cuda_minor_version = get_cuda_version() + # if(cuda_major_version, cuda_minor_version) < (12, 8) and pv_accum_dtype == 'fp32+fp16': + # warnings.warn("cuda version < 12.8, change pv_accum_dtype to 'fp32+fp32'") + # pv_accum_dtype = 'fp32+fp32' + + # FIXME(DefTruth): make sage attention work compatible with distributed + # env, for example, xDiT which launch by torchrun. 
Without this workaround, + # sage attention will run into illegal memory access error after first + # inference step in distributed env for multi gpus inference. This small + # workaround also make sage attention work compatible with torch.compile + # through non-fullgraph compile mode. + torch.cuda.set_device(v.device) + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + _is_caual = 1 if is_causal else 0 + _qk_quant_gran = 3 if qk_quant_gran == "per_thread" else 2 + _return_lse = 1 if return_lse else 0 + + head_dim_og = q.size(-1) + + if head_dim_og < 64: + q = torch.nn.functional.pad(q, (0, 64 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 64 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 64 - head_dim_og)) + elif head_dim_og > 64 and head_dim_og < 128: + q = torch.nn.functional.pad(q, (0, 128 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 128 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 128 - head_dim_og)) + elif head_dim_og > 128: + raise ValueError(f"Unsupported head_dim: {head_dim_og}") + + # assert last dim is contiguous + assert q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1, ( + "Last dim of qkv must be contiguous." + ) + + if sm_scale is None: + sm_scale = head_dim_og**-0.5 + + seq_dim = 1 if _tensor_layout == 0 else 2 + nh_dim = 2 if _tensor_layout == 0 else 1 + + if smooth_k: + km = k.mean(dim=seq_dim, keepdim=True) + nqheads = q.size(2) + nkheads = k.size(2) + q_per_kv_heads = nqheads // nkheads + if q_per_kv_heads > 1: + # nheads_k => nheads_q + km_broadcast = torch.repeat_interleave(km, q_per_kv_heads, dim=nh_dim) + else: + km_broadcast = km + if return_lse: + if tensor_layout == "NHD": + lse_correction = ( + torch.matmul( + q.transpose(1, 2), km_broadcast.transpose(1, 2).transpose(2, 3) + ) + .squeeze(-1) + .to(torch.float32) + ) + else: + lse_correction = ( + torch.matmul(q, km_broadcast.transpose(2, 3)) + .squeeze(-1) + .to(torch.float32) + ) + else: + km = None + + if qk_quant_gran == "per_warp": + q_int8, q_scale, k_int8, k_scale = per_warp_int8_cuda( + q, k, km, tensor_layout=tensor_layout, BLKQ=128, WARPQ=32, BLKK=64 + ) + elif qk_quant_gran == "per_thread": + q_int8, q_scale, k_int8, k_scale = per_thread_int8_triton( + q, k, km, tensor_layout=tensor_layout, BLKQ=128, WARPQ=32, BLKK=64, WARPK=64 + ) + + o = torch.empty(q.size(), dtype=dtype, device=q.device) + + if pv_accum_dtype == "fp32+fp32" and smooth_v: + warnings.warn("pv_accum_dtype is 'fp32+fp32', smooth_v will be ignored.") + smooth_v = False + + if pv_accum_dtype == "fp32+fp16" and smooth_v: + warnings.warn("pv_accum_dtype is 'fp32+fp16', smooth_v will be ignored.") + smooth_v = False + + quant_v_scale_max = 448.0 + if pv_accum_dtype == "fp32+fp16": + quant_v_scale_max = 2.25 + + v_fp8, v_scale, vm = per_channel_fp8( + v, tensor_layout=tensor_layout, scale_max=quant_v_scale_max, smooth_v=smooth_v + ) + print("before kernel call") + if pv_accum_dtype == "fp32": + if smooth_v: + lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_fuse_v_mean_attn( + q_int8, + k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + vm, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + torch.cuda.synchronize() + else: + lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn( + q_int8, + k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + torch.cuda.synchronize() + elif pv_accum_dtype == "fp32+fp32": + lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf( + q_int8, + 
k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + torch.cuda.synchronize() + elif pv_accum_dtype == "fp32+fp16": + lse = ops.qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf( + q_int8, + k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + torch.cuda.synchronize() + o = o[..., :head_dim_og] + print("after kernel call") + if return_lse: + return ( + o, + lse / 1.44269504 + lse_correction * sm_scale + if smooth_k + else lse / 1.44269504, + ) + else: + return o + + +@torch.compiler.disable +def sageattn_qk_int8_pv_fp8_cuda_sm90( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + tensor_layout: str = "HND", + is_causal: bool = False, + qk_quant_gran: str = "per_thread", + sm_scale: Optional[float] = None, + pv_accum_dtype: str = "fp32+fp32", + smooth_k: bool = True, + return_lse: bool = False, + **kwargs: Any, +) -> torch.Tensor: + """ + SageAttention with INT8 quantization for Q and K, FP8 PV with FP32 accumulation, implemented using CUDA. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + v : torch.Tensor + The value tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + is_causal : bool + Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len. + Default: False. + + qk_quant_gran : str + The granularity of quantization for Q and K, either "per_warp" or "per_thread". + Default: "per_thread". + + sm_scale : Optional[float] + The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``. + + pv_accum_dtype : str + The dtype of the accumulation of the product of the value tensor and the attention weights, either "fp32" or "fp32+fp32". + - "fp32": PV accumulation is done in fully in FP32. However, due to the hardware issue, there are only 22 valid bits in the FP32 accumulator. + - "fp32+fp32": PV accumulation is done in FP32 (actually FP22), but added to a FP32 buffer every few iterations. This offers a balance between speed and accuracy. + Default: "fp32+fp32". + + smooth_k : bool + Whether to smooth the key tensor by subtracting the mean along the sequence dimension. + Default: True. + + return_lse : bool + Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention. + Default: False. + + Returns + ------- + torch.Tensor + The output tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + torch.Tensor + The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor). + Shape: ``[batch_size, num_qo_heads, qo_len]``. + Only returned if `return_lse` is True. + + Note + ---- + - ``num_qo_heads`` must be divisible by ``num_kv_heads``. 
+ - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - All tensors must be on the same cuda device. + - `smooth_k` will introduce slight overhead but will improve the accuracy under most circumstances. + """ + + dtype = q.dtype + assert q.is_cuda, "Input tensors must be on cuda." + assert dtype in [torch.float16, torch.bfloat16], ( + "Input tensors must be in dtype of torch.float16 or torch.bfloat16" + ) + assert qk_quant_gran in ["per_warp", "per_thread"], ( + "qk_quant_gran must be either 'per_warp' or 'per_thread'." + ) + assert q.device == k.device == v.device, "All tensors must be on the same device." + assert q.dtype == k.dtype == v.dtype, "All tensors must have the same dtype." + + torch.cuda.set_device(v.device) + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + _is_caual = 1 if is_causal else 0 + _qk_quant_gran = 3 if qk_quant_gran == "per_thread" else 2 + _return_lse = 1 if return_lse else 0 + + head_dim_og = q.size(-1) + + if head_dim_og < 64: + q = torch.nn.functional.pad(q, (0, 64 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 64 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 64 - head_dim_og)) + elif head_dim_og > 64 and head_dim_og < 128: + q = torch.nn.functional.pad(q, (0, 128 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 128 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 128 - head_dim_og)) + elif head_dim_og > 128: + raise ValueError(f"Unsupported head_dim: {head_dim_og}") + + # assert last dim is contiguous + assert q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1, ( + "Last dim of qkv must be contiguous." + ) + + if sm_scale is None: + sm_scale = head_dim_og**-0.5 + + seq_dim = 1 if _tensor_layout == 0 else 2 + nh_dim = 2 if _tensor_layout == 0 else 1 + + if smooth_k: + km = k.mean(dim=seq_dim, keepdim=True) + nqheads = q.size(2) + nkheads = k.size(2) + q_per_kv_heads = nqheads // nkheads + if q_per_kv_heads > 1: + # nheads_k => nheads_q + km_broadcast = torch.repeat_interleave(km, q_per_kv_heads, dim=nh_dim) + else: + km_broadcast = km + if return_lse: + if tensor_layout == "NHD": + lse_correction = ( + torch.matmul( + q.transpose(1, 2), km_broadcast.transpose(1, 2).transpose(2, 3) + ) + .squeeze(-1) + .to(torch.float32) + ) + else: + lse_correction = ( + torch.matmul(q, km_broadcast.transpose(2, 3)) + .squeeze(-1) + .to(torch.float32) + ) + else: + km = None + + if qk_quant_gran == "per_warp": + q_int8, q_scale, k_int8, k_scale = per_warp_int8_cuda( + q, k, km, tensor_layout=tensor_layout, BLKQ=64, WARPQ=16, BLKK=128 + ) + elif qk_quant_gran == "per_thread": + q_int8, q_scale, k_int8, k_scale = per_thread_int8_triton( + q, + k, + km, + tensor_layout=tensor_layout, + BLKQ=64, + WARPQ=16, + BLKK=128, + WARPK=128, + ) + + o = torch.empty(q.size(), dtype=dtype, device=q.device) + + # pad v to multiple of 128 + # TODO: modify per_channel_fp8 kernel to handle this + kv_len = k.size(seq_dim) + v_pad_len = 128 - (kv_len % 128) if kv_len % 128 != 0 else 0 + if v_pad_len > 0: + if tensor_layout == "HND": + v = torch.cat( + [ + v, + torch.zeros( + v.size(0), + v.size(1), + v_pad_len, + v.size(3), + dtype=v.dtype, + device=v.device, + ), + ], + dim=2, + ) + else: + v = torch.cat( + [ + v, + torch.zeros( + v.size(0), + v_pad_len, + v.size(2), + v.size(3), + dtype=v.dtype, + device=v.device, + ), + ], + dim=1, + ) + + v_fp8, v_scale, _ = per_channel_fp8(v, tensor_layout=tensor_layout, smooth_v=False) + + if pv_accum_dtype == "fp32": + raise NotImplementedError("Please use 
pv_accum_dtype='fp32+fp32' for sm90.") + lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn( + q_int8, + k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + elif pv_accum_dtype == "fp32+fp32": + print( + "qint8", + q_int8.shape, + "qscale", + q_scale.shape, + "kint8", + k_int8.shape, + "kscale", + k_scale.shape, + "vfp8", + v_fp8.shape, + "vscale", + v_scale.shape, + ) + lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf_sm90( + q_int8, + k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + + o = o[..., :head_dim_og] + + if return_lse: + return ( + o, + lse / 1.44269504 + lse_correction * sm_scale + if smooth_k + else lse / 1.44269504, + ) + else: + return o diff --git a/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/layers.py b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/quant.py b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/quant.py new file mode 100644 index 0000000000000000000000000000000000000000..2e7a32c6a1e502bee12d0e0564ff2b90f6b00462 --- /dev/null +++ b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/quant.py @@ -0,0 +1,326 @@ +""" +Copyright (c) 2024 by SageAttention team. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import torch +from typing import Optional + +from ._ops import ops + + +def per_block_int8( + q: torch.Tensor, + k: torch.Tensor, + km: Optional[torch.Tensor] = None, + BLKQ: int = 128, + BLKK: int = 64, + sm_scale: Optional[float] = None, + tensor_layout: str = "HND", +): + """ + Quantize the query tensor `q` and the key tensor `k` with per block quantization. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + km : Optional[torch.Tensor] + The mean tensor of `k` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]``. + Should be of the same dtype as `k` if provided. Default is None. + + sm_scale : Optional[float] + The scale factor for the softmax operation. Default is ``head_dim**-0.5``. + It will be multiplied by ``1.44269504`` to work together with the triton attention kernel. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + Returns + ------- + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor] + A tuple containing: + - The quantized query tensor. Shape: Same as `q` but with `int8` dtype. 
+ - The scale tensor of the query tensor. Shape: ``[batch_size, num_qo_heads, (qo_len + BLKQ - 1) // BLKQ]`` with `float32` dtype. + - The quantized key tensor. Shape: Same as `k` but with `int8` dtype. + - The scale tensor of the key tensor. Shape: ``[batch_size, num_kv_heads, (kv_len + BLKK - 1) // BLKK]`` with `float32` dtype. + + Note + ---- + - The tensors `q` and `k` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + """ + + q_int8 = torch.empty(q.shape, dtype=torch.int8, device=q.device) + k_int8 = torch.empty(k.shape, dtype=torch.int8, device=k.device) + + if tensor_layout == "HND": + b, h_qo, qo_len, head_dim = q.shape + _, h_kv, kv_len, _ = k.shape + + elif tensor_layout == "NHD": + b, qo_len, h_qo, head_dim = q.shape + _, kv_len, h_kv, _ = k.shape + + else: + raise ValueError(f"Unknown tensor layout: {tensor_layout}") + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + + q_scale = torch.empty( + (b, h_qo, (qo_len + BLKQ - 1) // BLKQ), device=q.device, dtype=torch.float32 + ) + k_scale = torch.empty( + (b, h_kv, (kv_len + BLKK - 1) // BLKK), device=q.device, dtype=torch.float32 + ) + + if sm_scale is None: + sm_scale = head_dim**-0.5 + + sm_scale *= 1.44269504 + + ops.quant_per_block_int8_cuda(q, q_int8, q_scale, sm_scale, BLKQ, _tensor_layout) + if km is not None: + km = km.squeeze(1) if _tensor_layout == 0 else km.squeeze(2) + ops.quant_per_block_int8_fuse_sub_mean_cuda( + k, km, k_int8, k_scale, BLKK, _tensor_layout + ) + else: + # The bound CUDA op expects an sm_scale argument; use 1.0 for K to avoid scaling + ops.quant_per_block_int8_cuda(k, k_int8, k_scale, 1.0, BLKK, _tensor_layout) + + return q_int8, q_scale, k_int8, k_scale + + +def per_warp_int8( + q: torch.Tensor, + k: torch.Tensor, + km: Optional[torch.Tensor] = None, + BLKQ: int = 128, + WARPQ: int = 32, + BLKK: int = 64, + tensor_layout: str = "HND", +): + """ + Quantize the query tensor `q` with per warp quantization and the key tensor `k` with per block quantization. + Warp size of quantizing `q` is 16 or 32, with a block size of 64 or 128. + Block size of quantizing `k` is 64 or 128. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + km : Optional[torch.Tensor] + The mean tensor of `k` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]``. + Should be of the same dtype as `k` if provided. Default is None. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + Returns + ------- + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor] + A tuple containing: + - The quantized query tensor. Shape: Same as `q` but with `int8` dtype. + - The scale tensor of the query tensor. Shape: ``[batch_size, num_qo_heads, (qo_len + BLKQ - 1) // BLKQ * (BLKQ // WARPQ)]`` with `float32` dtype. + - The quantized key tensor. Shape: Same as `k` but with `int8` dtype. + - The scale tensor of the key tensor. Shape: ``[batch_size, num_kv_heads, (kv_len + BLKK - 1) // BLKK]`` with `float32` dtype. 
+ + Note + ---- + - The tensors `q` and `k` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + """ + + q_int8 = torch.empty(q.shape, dtype=torch.int8, device=q.device) + k_int8 = torch.empty(k.shape, dtype=torch.int8, device=k.device) + + if tensor_layout == "HND": + b, h_qo, qo_len, head_dim = q.shape + _, h_kv, kv_len, _ = k.shape + + elif tensor_layout == "NHD": + b, qo_len, h_qo, head_dim = q.shape + _, kv_len, h_kv, _ = k.shape + + else: + raise ValueError(f"Unknown tensor layout: {tensor_layout}") + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + + q_scale = torch.empty( + (b, h_qo, ((qo_len + BLKQ - 1) // BLKQ) * (BLKQ // WARPQ)), + device=q.device, + dtype=torch.float32, + ) + k_scale = torch.empty( + (b, h_kv, (kv_len + BLKK - 1) // BLKK), device=q.device, dtype=torch.float32 + ) + + ops.quant_per_warp_int8_cuda(q, q_int8, q_scale, BLKQ, WARPQ, _tensor_layout) + + if km is not None: + km = km.squeeze(1) if _tensor_layout == 0 else km.squeeze(2) + ops.quant_per_block_int8_fuse_sub_mean_cuda( + k, km, k_int8, k_scale, BLKK, _tensor_layout + ) + else: + # The bound CUDA op expects an sm_scale argument; use 1.0 for K to avoid scaling + ops.quant_per_block_int8_cuda(k, k_int8, k_scale, 1.0, BLKK, _tensor_layout) + + return q_int8, q_scale, k_int8, k_scale + + +def sub_mean(v: torch.Tensor, tensor_layout: str = "HND"): + """ + Calculate the mean of the tensor `v` along the sequence length dimension and subtract it from `v`. Result is stored as fp16. + + Parameters + ---------- + v : torch.Tensor + The input tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + Returns + ------- + Tuple[torch.Tensor, torch.Tensor] + A tuple containing: + - The tensor `v_smoothed` with the mean subtracted and stored as fp16. Shape: Same as `v` with `float16` dtype. + - The mean tensor of `v` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]`` with dtype same as `v`. + + Note + ---- + - The tensors `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - The returned tensor `v_smoothed` will have dtype ``torch.float16`` regardless of the input dtype. + - The returned mean tensor will have the same dtype as the input tensor. + """ + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + vm = v.mean(dim=1 if _tensor_layout == 0 else 2) + + v_smoothed = torch.empty(v.shape, dtype=torch.float16, device=v.device) + + # subtract mean and store the result as fp16 + ops.sub_mean_cuda(v, vm, v_smoothed, _tensor_layout) + + return v_smoothed, vm + + +def per_channel_fp8( + v: torch.Tensor, + tensor_layout: str = "HND", + scale_max: float = 448.0, + smooth_v: bool = True, +): + """ + Transpose, pad and permute the tensor `v` and quantize it to fp8 with per channel quantization. + `v` is first transposed along the head dimension and the sequence length dimension, then padded to a multiple of 64. + After that, the tensor is permuted along the sequence length dimension by ``[0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15]``. + The quantization is done per channel, with the scale value and smooth factor calculated per channel. + + Parameters + ---------- + v : torch.Tensor + The input tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. 
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + scale_max : float + The maximum scale value for the quantization. Default is 448.0 (upper bound of E4M3 data format). + + smooth_v : bool + Whether to smooth the quantized tensor. Default is True. + + Returns + ------- + Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] + A tuple containing: + - The quantized tensor `v_fp8`. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, head_dim, (kv_len + 63) // 64 * 64]``, with `float8_e4m3fn` dtype. + - If `tensor_layout` is "NHD": ``[batch_size, head_dim, num_kv_heads, (kv_len + 63) // 64 * 64]``, with `float8_e4m3fn` dtype. + - The scale tensor of `v`. Shape: ``[batch_size, num_kv_heads, head_dim]`` with `float32` dtype. + - The mean tensor of `v` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]`` with `float32` dtype. + + Note + ---- + - The tensors `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - The returned mean tensor will be None if `smooth_v` is False. Otherwise it will have dtype ``torch.float32``. + """ + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + + if tensor_layout == "HND": + b, h_kv, kv_len, head_dim = v.shape + padded_len = (kv_len + 63) // 64 * 64 + v_transposed_permutted = torch.empty( + (b, h_kv, head_dim, padded_len), dtype=v.dtype, device=v.device + ) + + elif tensor_layout == "NHD": + b, kv_len, h_kv, head_dim = v.shape + padded_len = (kv_len + 63) // 64 * 64 + v_transposed_permutted = torch.empty( + (b, head_dim, h_kv, padded_len), dtype=v.dtype, device=v.device + ) + + ops.transpose_pad_permute_cuda(v, v_transposed_permutted, _tensor_layout) + + v_fp8 = torch.empty( + v_transposed_permutted.shape, dtype=torch.float8_e4m3fn, device=v.device + ) + + v_scale = torch.empty((b, h_kv, head_dim), dtype=torch.float32, device=v.device) + vm = torch.empty((b, h_kv, head_dim), dtype=torch.float32, device=v.device) + + if smooth_v: + ops.mean_scale_fuse_quant_cuda( + v_transposed_permutted, + v_fp8, + vm, + v_scale, + kv_len, + scale_max, + _tensor_layout, + ) + return v_fp8, v_scale, vm + else: + ops.scale_fuse_quant_cuda( + v_transposed_permutted, v_fp8, v_scale, kv_len, scale_max, _tensor_layout + ) + return v_fp8, v_scale, None diff --git a/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/quant_per_thread.py b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/quant_per_thread.py new file mode 100644 index 0000000000000000000000000000000000000000..ab81f57c3e6dd9df89946ba54c4e2c3844c94d34 --- /dev/null +++ b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/quant_per_thread.py @@ -0,0 +1,204 @@ +""" +Copyright (c) 2024 by SageAttention team. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import torch +import triton +import triton.language as tl + +@triton.jit +def quant_query_per_thread_int8_kernel(Input, Output, Scale, L, + stride_iz, stride_ih, stride_in, + stride_oz, stride_oh, stride_on, + stride_sz, stride_sh, + C: tl.constexpr, BLK: tl.constexpr): + off_blk = tl.program_id(0) // 8 + off_tld = tl.program_id(0) % 8 + off_h = tl.program_id(1) + off_b = tl.program_id(2) + + offs_n = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld + offs_k = tl.arange(0, C) + + input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :] + output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :] + scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 8 + off_tld + + x = tl.load(input_ptrs, mask=offs_n[:, None] < L) + x = x.to(tl.float32) + scale = tl.max(tl.abs(x)) / 127. + 0.0000001 + x_int8 = x / scale + x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1) + x_int8 = x_int8.to(tl.int8) + tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L) + tl.store(scale_ptrs, scale) + +@triton.jit +def quant_key_per_thread_int8_kernel(Input, Output, Scale, L, + stride_iz, stride_ih, stride_in, + stride_oz, stride_oh, stride_on, + stride_sz, stride_sh, + C: tl.constexpr, BLK: tl.constexpr): + off_blk = tl.program_id(0) // 4 + off_tld = tl.program_id(0) % 4 + off_h = tl.program_id(1) + off_b = tl.program_id(2) + + # offs_n = off_blk * BLK + tl.cat(tl.arange(0, BLK // 8) * 8, tl.arange(0, BLK // 8) * 8 + 1, True) + off_tld * 2 + # offs_k = tl.arange(0, C) + + # input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :] + # output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :] + # scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 4 + off_tld + + # x = tl.load(input_ptrs, mask=offs_n[:, None] < L) + # x = x.to(tl.float32) + # scale = tl.max(tl.abs(x)) / 127. + 0.0000001 + # x_int8 = x / scale + # x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1) + # x_int8 = x_int8.to(tl.int8) + # tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L) + # tl.store(scale_ptrs, scale) + + offs_n0 = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld * 2 + offs_n1 = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld * 2 + 1 + offs_k = tl.arange(0, C) + + input_ptrs0 = Input + off_b * stride_iz + off_h * stride_ih + offs_n0[:, None] * stride_in + offs_k[None, :] + input_ptrs1 = Input + off_b * stride_iz + off_h * stride_ih + offs_n1[:, None] * stride_in + offs_k[None, :] + output_ptrs0 = Output + off_b * stride_oz + off_h * stride_oh + offs_n0[:, None] * stride_on + offs_k[None, :] + output_ptrs1 = Output + off_b * stride_oz + off_h * stride_oh + offs_n1[:, None] * stride_on + offs_k[None, :] + scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 4 + off_tld + + x0 = tl.load(input_ptrs0, mask=offs_n0[:, None] < L) + x1 = tl.load(input_ptrs1, mask=offs_n1[:, None] < L) + x0 = x0.to(tl.float32) + x1 = x1.to(tl.float32) + scale = max(tl.max(tl.abs(x0)), tl.max(tl.abs(x1))) / 127. 
+ 0.0000001 + x0_int8 = x0 / scale + x1_int8 = x1 / scale + x0_int8 += 0.5 * tl.where(x0_int8 >= 0, 1, -1) + x1_int8 += 0.5 * tl.where(x1_int8 >= 0, 1, -1) + x0_int8 = x0_int8.to(tl.int8) + x1_int8 = x1_int8.to(tl.int8) + tl.store(output_ptrs0, x0_int8, mask=offs_n0[:, None] < L) + tl.store(output_ptrs1, x1_int8, mask=offs_n1[:, None] < L) + tl.store(scale_ptrs, scale) + +@triton.jit +def quant_query_per_thread_int4_kernel(Input, Output, Scale, L, + stride_iz, stride_ih, stride_in, + stride_oz, stride_oh, stride_on, + stride_sz, stride_sh, + C: tl.constexpr, BLK: tl.constexpr): + off_blk = tl.program_id(0) // 8 + off_tld = tl.program_id(0) % 8 + off_h = tl.program_id(1) + off_b = tl.program_id(2) + + offs_n = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld + offs_k = tl.arange(0, C) + + input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :] + output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :] + scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 8 + off_tld + + x = tl.load(input_ptrs, mask=offs_n[:, None] < L) + x = x.to(tl.float32) + scale = tl.max(tl.abs(x)) / 7. + 0.0000001 + x_int8 = x / scale + x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1) + x_int8 = x_int8.to(tl.int8) + tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L) + tl.store(scale_ptrs, scale) + +@triton.jit +def quant_key_per_thread_int4_kernel(Input, Output, Scale, L, + stride_iz, stride_ih, stride_in, + stride_oz, stride_oh, stride_on, + stride_sz, stride_sh, + C: tl.constexpr, BLK: tl.constexpr): + off_blk = tl.program_id(0) // 4 + off_tld = tl.program_id(0) % 4 + off_h = tl.program_id(1) + off_b = tl.program_id(2) + + offs_n = off_blk * BLK + tl.cat(tl.arange(0, BLK // 8) * 8, tl.arange(0, BLK // 8) * 8 + 1, True) + off_tld * 2 + offs_k = tl.arange(0, C) + + input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :] + output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :] + scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 4 + off_tld + + x = tl.load(input_ptrs, mask=offs_n[:, None] < L) + x = x.to(tl.float32) + scale = tl.max(tl.abs(x)) / 7. 
+ 0.0000001 + x_int8 = x / scale + x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1) + x_int8 = x_int8.to(tl.int8) + tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L) + tl.store(scale_ptrs, scale) + +def per_thread_int8(q, k, km=None, BLKQ=128, WARPQ=32, BLKK=64, WARPK=64, sm_scale=None, tensor_layout="HND"): + q_int8 = torch.empty(q.shape, dtype=torch.int8, device=q.device) + k_int8 = torch.empty(k.shape, dtype=torch.int8, device=k.device) + + if km is not None: + k = k - km + + if tensor_layout == "HND": + b, h_qo, qo_len, head_dim = q.shape + _, h_kv, kv_len, _ = k.shape + + stride_bz_q, stride_h_q, stride_seq_q = q.stride(0), q.stride(1), q.stride(2) + stride_bz_qo, stride_h_qo, stride_seq_qo = q_int8.stride(0), q_int8.stride(1), q_int8.stride(2) + stride_bz_k, stride_h_k, stride_seq_k = k.stride(0), k.stride(1), k.stride(2) + stride_bz_ko, stride_h_ko, stride_seq_ko = k_int8.stride(0), k_int8.stride(1), k_int8.stride(2) + elif tensor_layout == "NHD": + b, qo_len, h_qo, head_dim = q.shape + _, kv_len, h_kv, _ = k.shape + + stride_bz_q, stride_h_q, stride_seq_q = q.stride(0), q.stride(2), q.stride(1) + stride_bz_qo, stride_h_qo, stride_seq_qo = q_int8.stride(0), q_int8.stride(2), q_int8.stride(1) + stride_bz_k, stride_h_k, stride_seq_k = k.stride(0), k.stride(2), k.stride(1) + stride_bz_ko, stride_h_ko, stride_seq_ko = k_int8.stride(0), k_int8.stride(2), k_int8.stride(1) + else: + raise ValueError(f"Unknown tensor layout: {tensor_layout}") + + q_scale = torch.empty((b, h_qo, (qo_len + BLKQ - 1) // BLKQ * (BLKQ // WARPQ) * 8), device=q.device, dtype=torch.float32) + k_scale = torch.empty((b, h_kv, (kv_len + BLKK - 1) // BLKK * (BLKK // WARPK) * 4), device=q.device, dtype=torch.float32) + + if sm_scale is None: + sm_scale = head_dim**-0.5 + + grid = ((qo_len + BLKQ - 1) // BLKQ * (BLKQ // WARPQ) * 8, h_qo, b) + quant_query_per_thread_int8_kernel[grid]( + q, q_int8, q_scale, qo_len, + stride_bz_q, stride_h_q, stride_seq_q, + stride_bz_qo, stride_h_qo, stride_seq_qo, + q_scale.stride(0), q_scale.stride(1), + C=head_dim, BLK=WARPQ + ) + + grid = ((kv_len + BLKK - 1) // BLKK * (BLKK // WARPK) * 4, h_kv, b) + quant_key_per_thread_int8_kernel[grid]( + k, k_int8, k_scale, kv_len, + stride_bz_k, stride_h_k, stride_seq_k, + stride_bz_ko, stride_h_ko, stride_seq_ko, + k_scale.stride(0), k_scale.stride(1), + C=head_dim, BLK=WARPK + ) + + return q_int8, q_scale, k_int8, k_scale \ No newline at end of file diff --git a/nix-build.log b/nix-build.log new file mode 100644 index 0000000000000000000000000000000000000000..6092aaedcfeaaa3a004fdf85a1e9fb8f117f5fda --- /dev/null +++ b/nix-build.log @@ -0,0 +1,13519 @@ +warning: Git tree '/home/ec2-user/dev/sage_attention' is dirty +warning: not writing modified lock file of flake 'git+file:///home/ec2-user/dev/sage_attention': +• Updated input 'kernel-builder': + 'github:huggingface/kernel-builder/967c94ec67830c5d85dc981407478939edd169f9?narHash=sha256-0EbrJkAx7yTOOjfJQFbk%2BBLo4MYfsD6JgRKibRYioo8%3D' (2025-09-25) + → 'github:huggingface/kernel-builder/9532ae833d245c03cb4daebd510e89e14cd27e7d?narHash=sha256-LYeNDsudfsy7extv59oyyirOv0%2BCG4hPIvTKnUaG7m0%3D' (2025-09-30) +evaluation warning: `rev` argument of `genFlakeOutputs` is deprecated, pass `self` as follows: + + kernel-builder.lib.genFlakeOutputs { + inherit self; + path = ./.; + }; +these 7 derivations will be built: + /nix/store/5i3gnhgvv278c7m9q3x3agksl5jab9ck-sage_attention-torch-ext.drv + /nix/store/bawig99wpvl8dvmdb3znykgir3w1nw15-sage_attention-torch-ext.drv + 
/nix/store/jzgwmpf18h1rrvfhclhq4say6m90j7y4-sage_attention-torch-ext.drv + /nix/store/msa3cr0rrgkm0dagqbcs67k8s169474b-sage_attention-torch-ext.drv + /nix/store/qckl1ak5l089b2sakw4h2whnd6mg16ld-sage_attention-torch-ext.drv + /nix/store/xq28asxbqp6g7x8bcz92xl849prg2899-torch-ext-bundle.drv + /nix/store/rkzh9xwk6kdgl1by4xfwmyvb5arpfqby-build-and-copy.drv +building '/nix/store/5i3gnhgvv278c7m9q3x3agksl5jab9ck-sage_attention-torch-ext.drv'... +building '/nix/store/bawig99wpvl8dvmdb3znykgir3w1nw15-sage_attention-torch-ext.drv'... +building '/nix/store/jzgwmpf18h1rrvfhclhq4say6m90j7y4-sage_attention-torch-ext.drv'... +building '/nix/store/msa3cr0rrgkm0dagqbcs67k8s169474b-sage_attention-torch-ext.drv'... +building '/nix/store/qckl1ak5l089b2sakw4h2whnd6mg16ld-sage_attention-torch-ext.drv'... +sage_attention-torch-ext> Sourcing get-kernel-check-hook.sh +sage_attention-torch-ext> Sourcing setup-cuda-hook +sage_attention-torch-ext> Sourcing get-kernel-check-hook.sh +sage_attention-torch-ext> Sourcing setup-cuda-hook +sage_attention-torch-ext> Sourcing get-kernel-check-hook.sh +sage_attention-torch-ext> Sourcing setup-cuda-hook +sage_attention-torch-ext> Sourcing get-kernel-check-hook.sh +sage_attention-torch-ext> Sourcing setup-cuda-hook +sage_attention-torch-ext> Sourcing get-kernel-check-hook.sh +sage_attention-torch-ext> Sourcing setup-cuda-hook +sage_attention-torch-ext> Running phase: unpackPhase +sage_attention-torch-ext> unpacking source archive /nix/store/zgm080lkrxljczr1rfx3aa781rzxzc4p-source +sage_attention-torch-ext> source root is source +sage_attention-torch-ext> Running phase: unpackPhase +sage_attention-torch-ext> Running phase: patchPhase +sage_attention-torch-ext> unpacking source archive /nix/store/zgm080lkrxljczr1rfx3aa781rzxzc4p-source +sage_attention-torch-ext> source root is source +sage_attention-torch-ext> Running phase: updateAutotoolsGnuConfigScriptsPhase +sage_attention-torch-ext> Running phase: unpackPhase +sage_attention-torch-ext> Running phase: patchPhase +sage_attention-torch-ext> unpacking source archive /nix/store/zgm080lkrxljczr1rfx3aa781rzxzc4p-source +sage_attention-torch-ext> Running phase: configurePhase +sage_attention-torch-ext> source root is source +sage_attention-torch-ext> Running phase: updateAutotoolsGnuConfigScriptsPhase +sage_attention-torch-ext> Running phase: unpackPhase +sage_attention-torch-ext> Executing setupCUDAToolkitCompilers +sage_attention-torch-ext> Running phase: patchPhase +sage_attention-torch-ext> fixing cmake files... +sage_attention-torch-ext> unpacking source archive /nix/store/zgm080lkrxljczr1rfx3aa781rzxzc4p-source +sage_attention-torch-ext> Running phase: configurePhase +sage_attention-torch-ext> source root is source +sage_attention-torch-ext> Running phase: updateAutotoolsGnuConfigScriptsPhase +sage_attention-torch-ext> Running phase: unpackPhase +sage_attention-torch-ext> Executing setupCUDAToolkitCompilers +sage_attention-torch-ext> Running phase: patchPhase +sage_attention-torch-ext> fixing cmake files... +sage_attention-torch-ext> unpacking source archive /nix/store/zgm080lkrxljczr1rfx3aa781rzxzc4p-source +sage_attention-torch-ext> Running phase: configurePhase +sage_attention-torch-ext> source root is source +sage_attention-torch-ext> Running phase: updateAutotoolsGnuConfigScriptsPhase +sage_attention-torch-ext> Executing setupCUDAToolkitCompilers +sage_attention-torch-ext> Running phase: patchPhase +sage_attention-torch-ext> fixing cmake files... 
+sage_attention-torch-ext> Running phase: configurePhase +sage_attention-torch-ext> cmake flags: -GNinja -DCMAKE_FIND_USE_SYSTEM_PACKAGE_REGISTRY=OFF -DCMAKE_FIND_USE_PACKAGE_REGISTRY=OFF -DCMAKE_EXPORT_NO_PACKAGE_REGISTRY=ON -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTING=OFF -DCMAKE_INSTALL_LOCALEDIR=/nix/store/wqplrwp0m14wqjzmh3x18a5pgc2kcs8k-sage_attention-torch-ext/share/locale -DCMAKE_INSTALL_LIBEXECDIR=/nix/store/wqplrwp0m14wqjzmh3x18a5pgc2kcs8k-sage_attention-torch-ext/libexec -DCMAKE_INSTALL_LIBDIR=/nix/store/wqplrwp0m14wqjzmh3x18a5pgc2kcs8k-sage_attention-torch-ext/lib -DCMAKE_INSTALL_DOCDIR=/nix/store/wqplrwp0m14wqjzmh3x18a5pgc2kcs8k-sage_attention-torch-ext/share/doc/sage_attention -DCMAKE_INSTALL_INFODIR=/nix/store/wqplrwp0m14wqjzmh3x18a5pgc2kcs8k-sage_attention-torch-ext/share/info -DCMAKE_INSTALL_MANDIR=/nix/store/wqplrwp0m14wqjzmh3x18a5pgc2kcs8k-sage_attention-torch-ext/share/man -DCMAKE_INSTALL_INCLUDEDIR=/nix/store/wqplrwp0m14wqjzmh3x18a5pgc2kcs8k-sage_attention-torch-ext/include -DCMAKE_INSTALL_SBINDIR=/nix/store/wqplrwp0m14wqjzmh3x18a5pgc2kcs8k-sage_attention-torch-ext/sbin -DCMAKE_INSTALL_BINDIR=/nix/store/wqplrwp0m14wqjzmh3x18a5pgc2kcs8k-sage_attention-torch-ext/bin -DCMAKE_INSTALL_NAME_DIR=/nix/store/wqplrwp0m14wqjzmh3x18a5pgc2kcs8k-sage_attention-torch-ext/lib -DCMAKE_POLICY_DEFAULT_CMP0025=NEW -DCMAKE_FIND_FRAMEWORK=LAST -DCMAKE_STRIP=/nix/store/rgfv9lch0b6ksjzlzsx0mljsb0ypqr8x-gcc-wrapper-13.4.0/bin/strip -DCMAKE_RANLIB=/nix/store/rgfv9lch0b6ksjzlzsx0mljsb0ypqr8x-gcc-wrapper-13.4.0/bin/ranlib -DCMAKE_AR=/nix/store/rgfv9lch0b6ksjzlzsx0mljsb0ypqr8x-gcc-wrapper-13.4.0/bin/ar -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DCMAKE_INSTALL_PREFIX=/nix/store/wqplrwp0m14wqjzmh3x18a5pgc2kcs8k-sage_attention-torch-ext -DPython_EXECUTABLE:STRING=/nix/store/j6r6hpjs8p5m4s3i8cqqavg62fd5z48g-python3-3.13.6-env/bin/python -DCMAKE_CUDA_HOST_COMPILER:STRING=/nix/store/rgfv9lch0b6ksjzlzsx0mljsb0ypqr8x-gcc-wrapper-13.4.0/bin/g++ -DNVCC_THREADS=3 -DCUDAToolkit_INCLUDE_DIR=/nix/store/7iw4ipbdy17yzvqjhxpw03i17kq7f7rj-cuda_nvcc-12.6.85/include\;/nix/store/5f6h6xs5c74iqcjda3y73i290mfwfs9x-cuda_nvml_dev-12.6.77-dev/include\;/nix/store/r26q9f2lhsvimxha44g1xcck3adrdqwg-cuda_nvrtc-12.6.85-dev/include\;/nix/store/nj1a061pvzpq9dr65yj3jpjqcx6pr4fq-cuda_nvtx-12.6.77-dev/include\;/nix/store/9ik1skjb698l6vkx4m4wvx2nrr4sx0na-libcufft-11.3.0.4-dev/include\;/nix/store/vl1dficb0blxzqg6xqzfi5p119jvl2vi-libcusolver-11.7.1.2-dev/include\;/nix/store/n7x9kkzi2jdfj6f6yjwywfhyfmn957zp-cuda_cupti-12.6.80-dev/include\;/nix/store/sskxmb670akk0avrahrl4r6hp7925zh8-cuda_cudart-12.6.77-dev/include\;/nix/store/8a9vz66yzsar01lpgipmzq8skyk3ymkp-cuda_cccl-12.6.77-dev/include\;/nix/store/xd2xrldv3lbg1bk93nr0yccy6j0vhh2k-cudnn-9.11.0.98-dev/include\;/nix/store/0w4g3rxgkw9r0lv737rslqdk7wldmi0n-libcurand-10.3.7.77-dev/include\;/nix/store/m0s4p867fk6wk8ba7ym9yff4mayqjhlw-libcusparse-12.5.4.2-dev/include\;/nix/store/blh9iyvjkmwd871mfjvfhnp7njwgnc6b-cuda_profiler_api-12.6.77-dev/include\;/nix/store/fy71fffqbwg3xgvygn66kd4igj65gblv-libcublas-12.6.4.1-dev/include\;/nix/store/4pwy3k2s52ppzbs3k6d58kda8jhmiim4-libcufile-1.11.1.6-dev/include 
-DCUDAToolkit_ROOT=/nix/store/7iw4ipbdy17yzvqjhxpw03i17kq7f7rj-cuda_nvcc-12.6.85\;/nix/store/1qgrl2sgdj5m7llm2vs9690gd9998psq-cudnn-9.11.0.98\;/nix/store/d2z15dzsgfm4r2yyl16n3wc0sw8z6fia-cuda_cupti-12.6.80-lib\;/nix/store/86ngm5djfbl6a0i43j282680chqz1vr8-libcusparse-12.5.4.2-lib\;/nix/store/bmph9rbyqnyjs02zriwq78kg16h12wi6-libcublas-12.6.4.1-lib\;/nix/store/wny8xmyma0ziffas96ansxgmjfqpw393-cuda_nvrtc-12.6.85-lib\;/nix/store/j40ndiqjiqbiqrbfmgmkzz6w8757cgvk-cuda_nvml_dev-12.6.77-lib\;/nix/store/3ii532blh586xxavim32i21kr84wlcdc-cuda_profiler_api-12.6.77\;/nix/store/j32l8jnzckhdy2lzxgyd59y7p39y6b1d-libcusolver-11.7.1.2-static\;/nix/store/5iv2zpbf4k00ch4c5zfi5b8dlj90y3d3-cuda_cccl-12.6.77\;/nix/store/a8yi28jqv5185bbv10jpjja3x98i86hm-cuda_cudart-12.6.77-stubs\;/nix/store/ya85qn68jv6mlq6gh6phh5hwk3dkynag-cuda_cudart-12.6.77-static\;/nix/store/m65ribrsnk3gbabcx9ah6phgiil19j01-libcufile-1.11.1.6\;/nix/store/5f6h6xs5c74iqcjda3y73i290mfwfs9x-cuda_nvml_dev-12.6.77-dev\;/nix/store/r26q9f2lhsvimxha44g1xcck3adrdqwg-cuda_nvrtc-12.6.85-dev\;/nix/store/nj1a061pvzpq9dr65yj3jpjqcx6pr4fq-cuda_nvtx-12.6.77-dev\;/nix/store/bcvj4g3f3n6cpb6czcb5k8zdmyd94fwi-cuda_nvtx-12.6.77-lib\;/nix/store/9ik1skjb698l6vkx4m4wvx2nrr4sx0na-libcufft-11.3.0.4-dev\;/nix/store/k5rbpivsz3ilsxg91pgigp6la8ln3cv9-cuda_cupti-12.6.80\;/nix/store/vl1dficb0blxzqg6xqzfi5p119jvl2vi-libcusolver-11.7.1.2-dev\;/nix/store/f87x0n0gi2d7rxh1ja92za2ixcw60q2p-cuda_nvtx-12.6.77\;/nix/store/n7x9kkzi2jdfj6f6yjwywfhyfmn957zp-cuda_cupti-12.6.80-dev\;/nix/store/m0fwdgh4nmrjd0q9v4m2ly63qbcq2hi2-cuda_cudart-12.6.77\;/nix/store/qfaxx4b8l1alrrl0gbyb23k3j850c0v5-libcurand-10.3.7.77-static\;/nix/store/w1npzy8mfl28w7cib5idkg6nvlbzhpzq-libcufile-1.11.1.6-lib\;/nix/store/8abbm2gd77dv0l3acw0s18wln36aa0l5-cuda_cudart-12.6.77-lib\;/nix/store/ykb9bv2lqkf1wzy73q96cb04pybx9xa2-cuda_nvcc-12.6.85-static\;/nix/store/nw9ws2qvhgdb33qgfx4iqj517814qq8y-libcufft-11.3.0.4\;/nix/store/sskxmb670akk0avrahrl4r6hp7925zh8-cuda_cudart-12.6.77-dev\;/nix/store/mfc3ah6lwfd8dfbs77b0z9i75c471b0n-libcufft-11.3.0.4-static\;/nix/store/zk3cg1ws6cskrzyhdr5d68f8zrkfk77d-cuda_nvrtc-12.6.85-static\;/nix/store/pcrirrvn2ya5d3r1y18s2zj4pm2jladw-libcusolver-11.7.1.2\;/nix/store/qdn67x8jrwr418air16kwicya4d747pq-libcufft-11.3.0.4-lib\;/nix/store/dg8hyrzy7sh3wdhcr4ywsz05cvl6vfyc-libcusparse-12.5.4.2\;/nix/store/8a9vz66yzsar01lpgipmzq8skyk3ymkp-cuda_cccl-12.6.77-dev\;/nix/store/wmcrrdxd3db58nklyp7yf90kknfdx6b5-libcurand-10.3.7.77-lib\;/nix/store/xd2xrldv3lbg1bk93nr0yccy6j0vhh2k-cudnn-9.11.0.98-dev\;/nix/store/0w4g3rxgkw9r0lv737rslqdk7wldmi0n-libcurand-10.3.7.77-dev\;/nix/store/jr1397g6pshvil5n4lnvp7dm24dm71h8-libcublas-12.6.4.1-static\;/nix/store/wq0wv7df58h6bgggnz964sk8m1hbkxxp-cuda_cupti-12.6.80-sample\;/nix/store/m0s4p867fk6wk8ba7ym9yff4mayqjhlw-libcusparse-12.5.4.2-dev\;/nix/store/blh9iyvjkmwd871mfjvfhnp7njwgnc6b-cuda_profiler_api-12.6.77-dev\;/nix/store/ngwsphsxf906z7cgwg32d1w83p809ywl-cudnn-9.11.0.98-static\;/nix/store/07zlxn68jyf4s263xafnjid55grmi7a2-cuda_nvrtc-12.6.85\;/nix/store/zyh7hqq402zc7dhafhbh9vycyzcfq256-libcurand-10.3.7.77\;/nix/store/x7mww4k0zzzb7bnffv0b22jqbyf1mg3v-cuda_cupti-12.6.80-static\;/nix/store/xvlapjc6spss1kvbjlq97m6pk19hfrxz-cuda_nvml_dev-12.6.77\;/nix/store/7j4zf0r8flh7l4x5pm1mgqb2vcabmcdj-libcusolver-11.7.1.2-lib\;/nix/store/gs8gw8bgjccrjxlyzhxa7h85gkxgqwhn-libcufile-1.11.1.6-static\;/nix/store/p9dnsv7mv8mqm9aisrckq8lm3zs3l7dk-cudnn-9.11.0.98-lib\;/nix/store/fy71fffqbwg3xgvygn66kd4igj65gblv-libcublas-12.6.4.1-dev\;/nix/store/dpska4iiya4xa5zzzmqzx3ljws73bnds-cuda_nvml_dev-12.6.77-
static\;/nix/store/gzykkbwmch7pxgfzf86fg0b928lz6b36-libcusparse-12.5.4.2-static\;/nix/store/nqn7lvw8gbwbymdhz4nak9wf9b5bbah9-libcublas-12.6.4.1\;/nix/store/4pwy3k2s52ppzbs3k6d58kda8jhmiim4-libcufile-1.11.1.6-dev -DPROTOC_EXE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DProtobuf_PROTOC_EXE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DProtobuf_PROTOC_EXECUTABLE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DPYBIND11_PYTHONLIBS_OVERWRITE=OFF -DPYTHON_EXECUTABLE=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/bin/python3.13 -DPYTHON_INCLUDE_DIR=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/include/python3.13 -DPYTHON_SITE_PACKAGES=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/lib/python3.13/site-packages +sage_attention-torch-ext> Running phase: updateAutotoolsGnuConfigScriptsPhase +sage_attention-torch-ext> Executing setupCUDAToolkitCompilers +sage_attention-torch-ext> fixing cmake files... +sage_attention-torch-ext> Running phase: configurePhase +sage_attention-torch-ext> cmake flags: -GNinja -DCMAKE_FIND_USE_SYSTEM_PACKAGE_REGISTRY=OFF -DCMAKE_FIND_USE_PACKAGE_REGISTRY=OFF -DCMAKE_EXPORT_NO_PACKAGE_REGISTRY=ON -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTING=OFF -DCMAKE_INSTALL_LOCALEDIR=/nix/store/794gpxvpsn0jkzi2zndpd6i6nhspwid2-sage_attention-torch-ext/share/locale -DCMAKE_INSTALL_LIBEXECDIR=/nix/store/794gpxvpsn0jkzi2zndpd6i6nhspwid2-sage_attention-torch-ext/libexec -DCMAKE_INSTALL_LIBDIR=/nix/store/794gpxvpsn0jkzi2zndpd6i6nhspwid2-sage_attention-torch-ext/lib -DCMAKE_INSTALL_DOCDIR=/nix/store/794gpxvpsn0jkzi2zndpd6i6nhspwid2-sage_attention-torch-ext/share/doc/sage_attention -DCMAKE_INSTALL_INFODIR=/nix/store/794gpxvpsn0jkzi2zndpd6i6nhspwid2-sage_attention-torch-ext/share/info -DCMAKE_INSTALL_MANDIR=/nix/store/794gpxvpsn0jkzi2zndpd6i6nhspwid2-sage_attention-torch-ext/share/man -DCMAKE_INSTALL_INCLUDEDIR=/nix/store/794gpxvpsn0jkzi2zndpd6i6nhspwid2-sage_attention-torch-ext/include -DCMAKE_INSTALL_SBINDIR=/nix/store/794gpxvpsn0jkzi2zndpd6i6nhspwid2-sage_attention-torch-ext/sbin -DCMAKE_INSTALL_BINDIR=/nix/store/794gpxvpsn0jkzi2zndpd6i6nhspwid2-sage_attention-torch-ext/bin -DCMAKE_INSTALL_NAME_DIR=/nix/store/794gpxvpsn0jkzi2zndpd6i6nhspwid2-sage_attention-torch-ext/lib -DCMAKE_POLICY_DEFAULT_CMP0025=NEW -DCMAKE_FIND_FRAMEWORK=LAST -DCMAKE_STRIP=/nix/store/rgfv9lch0b6ksjzlzsx0mljsb0ypqr8x-gcc-wrapper-13.4.0/bin/strip -DCMAKE_RANLIB=/nix/store/rgfv9lch0b6ksjzlzsx0mljsb0ypqr8x-gcc-wrapper-13.4.0/bin/ranlib -DCMAKE_AR=/nix/store/rgfv9lch0b6ksjzlzsx0mljsb0ypqr8x-gcc-wrapper-13.4.0/bin/ar -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DCMAKE_INSTALL_PREFIX=/nix/store/794gpxvpsn0jkzi2zndpd6i6nhspwid2-sage_attention-torch-ext -DPython_EXECUTABLE:STRING=/nix/store/r3gwdvvsgl1csl12f4pkhz0jhsch7bdy-python3-3.13.6-env/bin/python -DCMAKE_CUDA_HOST_COMPILER:STRING=/nix/store/rgfv9lch0b6ksjzlzsx0mljsb0ypqr8x-gcc-wrapper-13.4.0/bin/g++ -DNVCC_THREADS=3 
-DCUDAToolkit_INCLUDE_DIR=/nix/store/7iw4ipbdy17yzvqjhxpw03i17kq7f7rj-cuda_nvcc-12.6.85/include\;/nix/store/5f6h6xs5c74iqcjda3y73i290mfwfs9x-cuda_nvml_dev-12.6.77-dev/include\;/nix/store/r26q9f2lhsvimxha44g1xcck3adrdqwg-cuda_nvrtc-12.6.85-dev/include\;/nix/store/9ik1skjb698l6vkx4m4wvx2nrr4sx0na-libcufft-11.3.0.4-dev/include\;/nix/store/vl1dficb0blxzqg6xqzfi5p119jvl2vi-libcusolver-11.7.1.2-dev/include\;/nix/store/n7x9kkzi2jdfj6f6yjwywfhyfmn957zp-cuda_cupti-12.6.80-dev/include\;/nix/store/sskxmb670akk0avrahrl4r6hp7925zh8-cuda_cudart-12.6.77-dev/include\;/nix/store/8a9vz66yzsar01lpgipmzq8skyk3ymkp-cuda_cccl-12.6.77-dev/include\;/nix/store/xd2xrldv3lbg1bk93nr0yccy6j0vhh2k-cudnn-9.11.0.98-dev/include\;/nix/store/0w4g3rxgkw9r0lv737rslqdk7wldmi0n-libcurand-10.3.7.77-dev/include\;/nix/store/m0s4p867fk6wk8ba7ym9yff4mayqjhlw-libcusparse-12.5.4.2-dev/include\;/nix/store/blh9iyvjkmwd871mfjvfhnp7njwgnc6b-cuda_profiler_api-12.6.77-dev/include\;/nix/store/fy71fffqbwg3xgvygn66kd4igj65gblv-libcublas-12.6.4.1-dev/include\;/nix/store/4pwy3k2s52ppzbs3k6d58kda8jhmiim4-libcufile-1.11.1.6-dev/include -DCUDAToolkit_ROOT=/nix/store/7iw4ipbdy17yzvqjhxpw03i17kq7f7rj-cuda_nvcc-12.6.85\;/nix/store/1qgrl2sgdj5m7llm2vs9690gd9998psq-cudnn-9.11.0.98\;/nix/store/d2z15dzsgfm4r2yyl16n3wc0sw8z6fia-cuda_cupti-12.6.80-lib\;/nix/store/86ngm5djfbl6a0i43j282680chqz1vr8-libcusparse-12.5.4.2-lib\;/nix/store/bmph9rbyqnyjs02zriwq78kg16h12wi6-libcublas-12.6.4.1-lib\;/nix/store/wny8xmyma0ziffas96ansxgmjfqpw393-cuda_nvrtc-12.6.85-lib\;/nix/store/j40ndiqjiqbiqrbfmgmkzz6w8757cgvk-cuda_nvml_dev-12.6.77-lib\;/nix/store/3ii532blh586xxavim32i21kr84wlcdc-cuda_profiler_api-12.6.77\;/nix/store/j32l8jnzckhdy2lzxgyd59y7p39y6b1d-libcusolver-11.7.1.2-static\;/nix/store/5iv2zpbf4k00ch4c5zfi5b8dlj90y3d3-cuda_cccl-12.6.77\;/nix/store/a8yi28jqv5185bbv10jpjja3x98i86hm-cuda_cudart-12.6.77-stubs\;/nix/store/ya85qn68jv6mlq6gh6phh5hwk3dkynag-cuda_cudart-12.6.77-static\;/nix/store/m65ribrsnk3gbabcx9ah6phgiil19j01-libcufile-1.11.1.6\;/nix/store/5f6h6xs5c74iqcjda3y73i290mfwfs9x-cuda_nvml_dev-12.6.77-dev\;/nix/store/r26q9f2lhsvimxha44g1xcck3adrdqwg-cuda_nvrtc-12.6.85-dev\;/nix/store/9ik1skjb698l6vkx4m4wvx2nrr4sx0na-libcufft-11.3.0.4-dev\;/nix/store/k5rbpivsz3ilsxg91pgigp6la8ln3cv9-cuda_cupti-12.6.80\;/nix/store/vl1dficb0blxzqg6xqzfi5p119jvl2vi-libcusolver-11.7.1.2-dev\;/nix/store/n7x9kkzi2jdfj6f6yjwywfhyfmn957zp-cuda_cupti-12.6.80-dev\;/nix/store/m0fwdgh4nmrjd0q9v4m2ly63qbcq2hi2-cuda_cudart-12.6.77\;/nix/store/qfaxx4b8l1alrrl0gbyb23k3j850c0v5-libcurand-10.3.7.77-static\;/nix/store/w1npzy8mfl28w7cib5idkg6nvlbzhpzq-libcufile-1.11.1.6-lib\;/nix/store/8abbm2gd77dv0l3acw0s18wln36aa0l5-cuda_cudart-12.6.77-lib\;/nix/store/ykb9bv2lqkf1wzy73q96cb04pybx9xa2-cuda_nvcc-12.6.85-static\;/nix/store/nw9ws2qvhgdb33qgfx4iqj517814qq8y-libcufft-11.3.0.4\;/nix/store/sskxmb670akk0avrahrl4r6hp7925zh8-cuda_cudart-12.6.77-dev\;/nix/store/mfc3ah6lwfd8dfbs77b0z9i75c471b0n-libcufft-11.3.0.4-static\;/nix/store/zk3cg1ws6cskrzyhdr5d68f8zrkfk77d-cuda_nvrtc-12.6.85-static\;/nix/store/pcrirrvn2ya5d3r1y18s2zj4pm2jladw-libcusolver-11.7.1.2\;/nix/store/qdn67x8jrwr418air16kwicya4d747pq-libcufft-11.3.0.4-lib\;/nix/store/dg8hyrzy7sh3wdhcr4ywsz05cvl6vfyc-libcusparse-12.5.4.2\;/nix/store/8a9vz66yzsar01lpgipmzq8skyk3ymkp-cuda_cccl-12.6.77-dev\;/nix/store/wmcrrdxd3db58nklyp7yf90kknfdx6b5-libcurand-10.3.7.77-lib\;/nix/store/xd2xrldv3lbg1bk93nr0yccy6j0vhh2k-cudnn-9.11.0.98-dev\;/nix/store/0w4g3rxgkw9r0lv737rslqdk7wldmi0n-libcurand-10.3.7.77-dev\;/nix/store/jr1397g6pshvil5n4lnvp7dm24dm71h8-libcublas-12.6.4.1-s
tatic\;/nix/store/wq0wv7df58h6bgggnz964sk8m1hbkxxp-cuda_cupti-12.6.80-sample\;/nix/store/m0s4p867fk6wk8ba7ym9yff4mayqjhlw-libcusparse-12.5.4.2-dev\;/nix/store/blh9iyvjkmwd871mfjvfhnp7njwgnc6b-cuda_profiler_api-12.6.77-dev\;/nix/store/ngwsphsxf906z7cgwg32d1w83p809ywl-cudnn-9.11.0.98-static\;/nix/store/07zlxn68jyf4s263xafnjid55grmi7a2-cuda_nvrtc-12.6.85\;/nix/store/zyh7hqq402zc7dhafhbh9vycyzcfq256-libcurand-10.3.7.77\;/nix/store/x7mww4k0zzzb7bnffv0b22jqbyf1mg3v-cuda_cupti-12.6.80-static\;/nix/store/xvlapjc6spss1kvbjlq97m6pk19hfrxz-cuda_nvml_dev-12.6.77\;/nix/store/7j4zf0r8flh7l4x5pm1mgqb2vcabmcdj-libcusolver-11.7.1.2-lib\;/nix/store/gs8gw8bgjccrjxlyzhxa7h85gkxgqwhn-libcufile-1.11.1.6-static\;/nix/store/p9dnsv7mv8mqm9aisrckq8lm3zs3l7dk-cudnn-9.11.0.98-lib\;/nix/store/fy71fffqbwg3xgvygn66kd4igj65gblv-libcublas-12.6.4.1-dev\;/nix/store/dpska4iiya4xa5zzzmqzx3ljws73bnds-cuda_nvml_dev-12.6.77-static\;/nix/store/gzykkbwmch7pxgfzf86fg0b928lz6b36-libcusparse-12.5.4.2-static\;/nix/store/nqn7lvw8gbwbymdhz4nak9wf9b5bbah9-libcublas-12.6.4.1\;/nix/store/4pwy3k2s52ppzbs3k6d58kda8jhmiim4-libcufile-1.11.1.6-dev -DPROTOC_EXE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DProtobuf_PROTOC_EXE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DProtobuf_PROTOC_EXECUTABLE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DPYBIND11_PYTHONLIBS_OVERWRITE=OFF -DPYTHON_EXECUTABLE=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/bin/python3.13 -DPYTHON_INCLUDE_DIR=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/include/python3.13 -DPYTHON_SITE_PACKAGES=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/lib/python3.13/site-packages +sage_attention-torch-ext> Executing setupCUDAToolkitCompilers +sage_attention-torch-ext> cmake flags: -GNinja -DCMAKE_FIND_USE_SYSTEM_PACKAGE_REGISTRY=OFF -DCMAKE_FIND_USE_PACKAGE_REGISTRY=OFF -DCMAKE_EXPORT_NO_PACKAGE_REGISTRY=ON -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTING=OFF -DCMAKE_INSTALL_LOCALEDIR=/nix/store/vas14gsv17kdkap3r9szvjsclanmwq25-sage_attention-torch-ext/share/locale -DCMAKE_INSTALL_LIBEXECDIR=/nix/store/vas14gsv17kdkap3r9szvjsclanmwq25-sage_attention-torch-ext/libexec -DCMAKE_INSTALL_LIBDIR=/nix/store/vas14gsv17kdkap3r9szvjsclanmwq25-sage_attention-torch-ext/lib -DCMAKE_INSTALL_DOCDIR=/nix/store/vas14gsv17kdkap3r9szvjsclanmwq25-sage_attention-torch-ext/share/doc/sage_attention -DCMAKE_INSTALL_INFODIR=/nix/store/vas14gsv17kdkap3r9szvjsclanmwq25-sage_attention-torch-ext/share/info -DCMAKE_INSTALL_MANDIR=/nix/store/vas14gsv17kdkap3r9szvjsclanmwq25-sage_attention-torch-ext/share/man -DCMAKE_INSTALL_INCLUDEDIR=/nix/store/vas14gsv17kdkap3r9szvjsclanmwq25-sage_attention-torch-ext/include -DCMAKE_INSTALL_SBINDIR=/nix/store/vas14gsv17kdkap3r9szvjsclanmwq25-sage_attention-torch-ext/sbin -DCMAKE_INSTALL_BINDIR=/nix/store/vas14gsv17kdkap3r9szvjsclanmwq25-sage_attention-torch-ext/bin -DCMAKE_INSTALL_NAME_DIR=/nix/store/vas14gsv17kdkap3r9szvjsclanmwq25-sage_attention-torch-ext/lib -DCMAKE_POLICY_DEFAULT_CMP0025=NEW -DCMAKE_FIND_FRAMEWORK=LAST -DCMAKE_STRIP=/nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/strip -DCMAKE_RANLIB=/nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/ranlib -DCMAKE_AR=/nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/ar -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DCMAKE_INSTALL_PREFIX=/nix/store/vas14gsv17kdkap3r9szvjsclanmwq25-sage_attention-torch-ext 
-DPython_EXECUTABLE:STRING=/nix/store/aikr517kmcd8r2nrrj70jq71d7352qiq-python3-3.13.6-env/bin/python -DCMAKE_CUDA_HOST_COMPILER:STRING=/nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/g++ -DNVCC_THREADS=3 -DCUDAToolkit_INCLUDE_DIR=/nix/store/kky5wd8qwb0hx3jb3j9qc1bkwznw3z83-libcusparse-12.5.10.65-dev/include\;/nix/store/dd8wl3nnsigw2gj5bwaiswla97jpw1jz-libcublas-12.9.1.4-dev/include\;/nix/store/zsmc0yjbjrfbamm9ycrlz5yzi5hrbag1-libcurand-10.3.10.19-dev/include\;/nix/store/ip4lb9ximc445dbdkdvia4whx83g00g3-libcusolver-11.7.5.82-dev/include\;/nix/store/81xppf0rrqfasvg7wy4z891ab473nb9v-libcufile-1.14.1.1-dev/include\;/nix/store/nkvyh0qxbfj2wbm3r800xd6x1fhs1s4x-cuda_cccl-12.9.27-dev/include\;/nix/store/ik96pdimvw3bjj8wdr6laxycnn5lpwby-libcufft-11.4.1.4-dev/include\;/nix/store/f9r19xpj8qayy3b74gx3gbjrq0z1aq3b-cuda_nvml_dev-12.9.79-dev/include\;/nix/store/0kycn0pb0x46h16afxw2bjrm1gjq1355-cuda_profiler_api-12.9.79-dev/include\;/nix/store/z2xfln4d3r92hjjihlq5w6hvh5qhpcb4-cudnn-9.11.0.98-dev/include\;/nix/store/x4w41r4jyapqwdghvi6xrpd0mnim4x08-cuda_cudart-12.9.79-dev/include\;/nix/store/8zrv6h6f2cfz34pwq012n4cx2zrv5m1s-cuda_nvcc-12.9.86/include\;/nix/store/f21f8hghg4fiwa2ix29h1zy854p7q4v6-cuda_nvrtc-12.9.86-dev/include\;/nix/store/ns0brisbkgrjyfi16rlyjjgcym4jk6qv-cuda_cupti-12.9.79-dev/include -DCUDAToolkit_ROOT=/nix/store/8zrv6h6f2cfz34pwq012n4cx2zrv5m1s-cuda_nvcc-12.9.86\;/nix/store/q2al0drhrl0yxk97xbsjl8d0h25kmsq9-libcurand-10.3.10.19-lib\;/nix/store/ax1ssn45048qbmyy19basgv6q64y5jy0-cuda_cupti-12.9.79\;/nix/store/m09542l6q83flp3asv2r4j3wcbjqksvg-libcufile-1.14.1.1-static\;/nix/store/b3wbcra9cziq8bwf3yhmj2nn1mf5bqy2-cuda_cudart-12.9.79-lib\;/nix/store/j5kp5fg9mn6hhslk18wbmskc7v96l353-cuda_cupti-12.9.79-static\;/nix/store/kky5wd8qwb0hx3jb3j9qc1bkwznw3z83-libcusparse-12.5.10.65-dev\;/nix/store/dd8wl3nnsigw2gj5bwaiswla97jpw1jz-libcublas-12.9.1.4-dev\;/nix/store/zsmc0yjbjrfbamm9ycrlz5yzi5hrbag1-libcurand-10.3.10.19-dev\;/nix/store/3s79bz4ldkhlks6jf9a2jd4r34y6018b-libcurand-10.3.10.19\;/nix/store/v48xzq66pzmygxqkws17n9nvpa7lad9d-cuda_nvml_dev-12.9.79\;/nix/store/6via2axi1n31n685jii6dwaiqca8b2rc-cuda_nvcc-12.9.86-static\;/nix/store/v0hx9fqdlmz9kvjd9sqr2zc141ny10yn-cuda_profiler_api-12.9.79\;/nix/store/ip4lb9ximc445dbdkdvia4whx83g00g3-libcusolver-11.7.5.82-dev\;/nix/store/8cig7k11qv5g8x0j8n2mbdfzwrnf7cg2-cuda_cudart-12.9.79-stubs\;/nix/store/xg8pj5m74n2h3v8kgxbvmbpcl90rzmlx-cudnn-9.11.0.98-static\;/nix/store/v4b7mkhyq1akczzkcyynj7y9c61l9dc7-cuda_cudart-12.9.79-static\;/nix/store/hw2swakbrvi4innrymcw8i2m98p73br0-cuda_cupti-12.9.79-sample\;/nix/store/s1i2kadnni2m4skpzzqzfzc3bpmrxi7p-libcusparse-12.5.10.65-lib\;/nix/store/81xppf0rrqfasvg7wy4z891ab473nb9v-libcufile-1.14.1.1-dev\;/nix/store/0a83zdhkh2i9d97r4zqdn8fi8vn4wfk3-libcublas-12.9.1.4-static\;/nix/store/nkvyh0qxbfj2wbm3r800xd6x1fhs1s4x-cuda_cccl-12.9.27-dev\;/nix/store/jnhjz87sm9nbnb72n54jj2l99szrzpg2-libcusparse-12.5.10.65\;/nix/store/ik96pdimvw3bjj8wdr6laxycnn5lpwby-libcufft-11.4.1.4-dev\;/nix/store/d1m6c5i6y6ncjygpdmv1b4pmd91hvjr2-cuda_cupti-12.9.79-lib\;/nix/store/49p6af3v11dcxvq9andr6l8csa2sr4j4-cuda_nvrtc-12.9.86-static\;/nix/store/bfygrgghga26l7br5d5j3h6hd1s21rkn-cudnn-9.11.0.98\;/nix/store/a6an9chi5dvjsybrfrxql0bn76xswzpa-libcufft-11.4.1.4\;/nix/store/f9r19xpj8qayy3b74gx3gbjrq0z1aq3b-cuda_nvml_dev-12.9.79-dev\;/nix/store/7zy91byrxpnyzhjlwham2gqyir2x6f54-libcusolver-11.7.5.82-lib\;/nix/store/0kycn0pb0x46h16afxw2bjrm1gjq1355-cuda_profiler_api-12.9.79-dev\;/nix/store/cx0hyla7fkqqc5hh1gn4hkarjyjvbjhf-libcusparse-12.5.10.65-static\;/
nix/store/3yi8kx62nklnyn77zn4z23hi03l9c7ff-libcusolver-11.7.5.82-static\;/nix/store/z2xfln4d3r92hjjihlq5w6hvh5qhpcb4-cudnn-9.11.0.98-dev\;/nix/store/86nq76ks8vlgjdsnh1hkskyfw7mm3plc-cuda_cccl-12.9.27\;/nix/store/01ywykdxfkvp64318anifgx7zaavz9ql-cuda_nvml_dev-12.9.79-lib\;/nix/store/qv2m9i0nby2p03xx37mkkm84dlqb9s84-cuda_cudart-12.9.79\;/nix/store/a09saq5rl5jxbgv9gqllx0080ypjk00x-libcufile-1.14.1.1-lib\;/nix/store/0l18n4dhavr0p4rk0nyqqjr8paacak13-libcufile-1.14.1.1\;/nix/store/r8ly0w88qv4gw3lhd784ha0ag221c23s-cuda_nvrtc-12.9.86-lib\;/nix/store/rngn6cls1blhilrw78xb3pjgwghibhzk-libcurand-10.3.10.19-static\;/nix/store/x4w41r4jyapqwdghvi6xrpd0mnim4x08-cuda_cudart-12.9.79-dev\;/nix/store/ikw7sqic4kknjkp50dr54khgs06q1hbv-cuda_nvml_dev-12.9.79-static\;/nix/store/bzdnjn29xj8a73wg16qrz0sswi9svp0x-libcublas-12.9.1.4\;/nix/store/62hqkwasnanq5i1j63z4clc0s4c61k1r-libcufft-11.4.1.4-static\;/nix/store/5sjldyn2vmm4ky24v1f9ggs0hps496q3-libcusolver-11.7.5.82\;/nix/store/9c924z3749bfm078bwq4ad12kjz46pjf-libcufft-11.4.1.4-lib\;/nix/store/f21f8hghg4fiwa2ix29h1zy854p7q4v6-cuda_nvrtc-12.9.86-dev\;/nix/store/c1kdvq8xqqkwzzazl99w20h4x9z0f9pc-libcublas-12.9.1.4-lib\;/nix/store/ns0brisbkgrjyfi16rlyjjgcym4jk6qv-cuda_cupti-12.9.79-dev\;/nix/store/h6kzw3gvlv4sa0apb4fflpjlirhj72ga-cudnn-9.11.0.98-lib\;/nix/store/f5gvpjis5y727lw6vzr2h1zkb3hm08k2-cuda_nvrtc-12.9.86 -DPROTOC_EXE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DProtobuf_PROTOC_EXE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DProtobuf_PROTOC_EXECUTABLE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DPYBIND11_PYTHONLIBS_OVERWRITE=OFF -DPYTHON_EXECUTABLE=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/bin/python3.13 -DPYTHON_INCLUDE_DIR=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/include/python3.13 -DPYTHON_SITE_PACKAGES=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/lib/python3.13/site-packages +sage_attention-torch-ext> fixing cmake files... 
+sage_attention-torch-ext> cmake flags: -GNinja -DCMAKE_FIND_USE_SYSTEM_PACKAGE_REGISTRY=OFF -DCMAKE_FIND_USE_PACKAGE_REGISTRY=OFF -DCMAKE_EXPORT_NO_PACKAGE_REGISTRY=ON -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTING=OFF -DCMAKE_INSTALL_LOCALEDIR=/nix/store/6jmb39fj6a2hjg90bjmrxna5vkivwy8n-sage_attention-torch-ext/share/locale -DCMAKE_INSTALL_LIBEXECDIR=/nix/store/6jmb39fj6a2hjg90bjmrxna5vkivwy8n-sage_attention-torch-ext/libexec -DCMAKE_INSTALL_LIBDIR=/nix/store/6jmb39fj6a2hjg90bjmrxna5vkivwy8n-sage_attention-torch-ext/lib -DCMAKE_INSTALL_DOCDIR=/nix/store/6jmb39fj6a2hjg90bjmrxna5vkivwy8n-sage_attention-torch-ext/share/doc/sage_attention -DCMAKE_INSTALL_INFODIR=/nix/store/6jmb39fj6a2hjg90bjmrxna5vkivwy8n-sage_attention-torch-ext/share/info -DCMAKE_INSTALL_MANDIR=/nix/store/6jmb39fj6a2hjg90bjmrxna5vkivwy8n-sage_attention-torch-ext/share/man -DCMAKE_INSTALL_INCLUDEDIR=/nix/store/6jmb39fj6a2hjg90bjmrxna5vkivwy8n-sage_attention-torch-ext/include -DCMAKE_INSTALL_SBINDIR=/nix/store/6jmb39fj6a2hjg90bjmrxna5vkivwy8n-sage_attention-torch-ext/sbin -DCMAKE_INSTALL_BINDIR=/nix/store/6jmb39fj6a2hjg90bjmrxna5vkivwy8n-sage_attention-torch-ext/bin -DCMAKE_INSTALL_NAME_DIR=/nix/store/6jmb39fj6a2hjg90bjmrxna5vkivwy8n-sage_attention-torch-ext/lib -DCMAKE_POLICY_DEFAULT_CMP0025=NEW -DCMAKE_FIND_FRAMEWORK=LAST -DCMAKE_STRIP=/nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/strip -DCMAKE_RANLIB=/nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/ranlib -DCMAKE_AR=/nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/ar -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DCMAKE_INSTALL_PREFIX=/nix/store/6jmb39fj6a2hjg90bjmrxna5vkivwy8n-sage_attention-torch-ext -DPython_EXECUTABLE:STRING=/nix/store/qal2apcjwlw2p2kk05dwqdgzh8ml687l-python3-3.13.6-env/bin/python -DCMAKE_CUDA_HOST_COMPILER:STRING=/nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/g++ -DNVCC_THREADS=3 -DCUDAToolkit_INCLUDE_DIR=/nix/store/2dc9bgppqvyd6bd5m4j9zphiyhhd39lv-libcurand-10.3.9.90-dev/include\;/nix/store/x6d389mfn7v413ia2had715g7rdgghgm-cuda_nvrtc-12.8.93-dev/include\;/nix/store/4sz65s9xk80q9jij0i4zbp9xd1pmr3ja-libcusparse-12.5.8.93-dev/include\;/nix/store/11bshw90q985bpd9ds649qmgg0x54q7x-cudnn-9.11.0.98-dev/include\;/nix/store/8dwjdyr7y3dkqlgswpn9swz884lx62gf-cuda_cccl-12.8.90-dev/include\;/nix/store/4cq7zkla3djm6g5gkpzzx4gfikda2k7z-cuda_profiler_api-12.8.90-dev/include\;/nix/store/90nghg4zsrw6gki8y8hw4id3p31bc8rk-libcusolver-11.7.3.90-dev/include\;/nix/store/vg32acb8vlqyhkhabbgvmralfw0kwhi3-cuda_cudart-12.8.90-dev/include\;/nix/store/vqg4r8izl1fy2smmw4dwv4x1adkj0rfb-libcufft-11.3.3.83-dev/include\;/nix/store/8kyv8ffbfvksnqmm1kaz0llysg7dpn9z-cuda_nvcc-12.8.93/include\;/nix/store/5pvax5f2dg278j43b4llkdxim9y0bjaf-cuda_nvml_dev-12.8.90-dev/include\;/nix/store/mps4gsnyk6s676zadvcykjxn08yghk5a-libcufile-1.13.1.3-dev/include\;/nix/store/gz9xyhflw755r8fcxkc816fp54sj0hl4-cuda_cupti-12.8.90-dev/include\;/nix/store/qa4d2v0lsm6giyr4b4421qsdygz0yrrh-libcublas-12.8.4.1-dev/include 
-DCUDAToolkit_ROOT=/nix/store/8kyv8ffbfvksnqmm1kaz0llysg7dpn9z-cuda_nvcc-12.8.93\;/nix/store/w96jlfiy431jnsww1x3ak3chhssa3i2s-libcusparse-12.5.8.93\;/nix/store/6zj6v3b9v8xdjs94iq1228slqwr757ij-libcublas-12.8.4.1\;/nix/store/q85pndpvaqdznfijmkn0mlfp8y3v08dl-cuda_cccl-12.8.90\;/nix/store/2dc9bgppqvyd6bd5m4j9zphiyhhd39lv-libcurand-10.3.9.90-dev\;/nix/store/cwy7010iwla9b2v1fx82sp66v12r913x-libcublas-12.8.4.1-lib\;/nix/store/x6d389mfn7v413ia2had715g7rdgghgm-cuda_nvrtc-12.8.93-dev\;/nix/store/22n25ss46s0hgspdp26qk025w9m393cd-libcublas-12.8.4.1-static\;/nix/store/sc5wnfvmk0j73xdppxj25kgk8s98lscs-cuda_nvrtc-12.8.93-lib\;/nix/store/54wqrrh6qbrwmv2wkz6b216ljrqbhcji-cudnn-9.11.0.98\;/nix/store/4sz65s9xk80q9jij0i4zbp9xd1pmr3ja-libcusparse-12.5.8.93-dev\;/nix/store/11bshw90q985bpd9ds649qmgg0x54q7x-cudnn-9.11.0.98-dev\;/nix/store/8dwjdyr7y3dkqlgswpn9swz884lx62gf-cuda_cccl-12.8.90-dev\;/nix/store/1v8m3gdw08hnbs7qa4jbkflm9lg1r5q6-libcurand-10.3.9.90\;/nix/store/jc58pv1cxhvpblrnzgaai60x04q6m0bp-cuda_nvml_dev-12.8.90-lib\;/nix/store/khwhv5d4kmzjpsm785iz3sva6i9sj9r5-libcufile-1.13.1.3-static\;/nix/store/xv6c2jcc3adyqks2xl28p4r0q1g4bc92-cuda_cupti-12.8.90\;/nix/store/a2h2yfjfx0si8smnqmghw7ccj0qbnv81-cuda_cupti-12.8.90-lib\;/nix/store/4cq7zkla3djm6g5gkpzzx4gfikda2k7z-cuda_profiler_api-12.8.90-dev\;/nix/store/xccbzbpcn8r506zdvhvbkqkilhlrh3c5-cuda_cudart-12.8.90-lib\;/nix/store/acbir62i1d7kvka4plmxsq8442z7r1l2-cuda_cudart-12.8.90-stubs\;/nix/store/ckkcbggf4x93zg3xn9xr00jgxs2x5p21-cuda_nvml_dev-12.8.90-static\;/nix/store/ml3bkm8bz1lnjmfd8lyxbjqpi1llasr2-libcusolver-11.7.3.90\;/nix/store/9zlrjnq7lisarny3llszk131vy816x2w-libcufile-1.13.1.3\;/nix/store/90nghg4zsrw6gki8y8hw4id3p31bc8rk-libcusolver-11.7.3.90-dev\;/nix/store/vg32acb8vlqyhkhabbgvmralfw0kwhi3-cuda_cudart-12.8.90-dev\;/nix/store/y27d2s3rcw8d17wcw23glhlj5rhs8d6y-cuda_cudart-12.8.90\;/nix/store/n96pib9yj31n031dmrrx43m61js1r5rn-cuda_nvcc-12.8.93-static\;/nix/store/pabakly3280dnghh3i89wklfm61raf7z-cuda_cupti-12.8.90-sample\;/nix/store/l0jiwp1f0dhigd41qqf408c5qyabz2vd-cudnn-9.11.0.98-static\;/nix/store/95lzbxp68m127n6hyllbr3dh2mlj7y8m-libcufft-11.3.3.83\;/nix/store/lxsd5l6hnqcfgqc1nsn8mmmpx385m3k8-libcusparse-12.5.8.93-lib\;/nix/store/vqg4r8izl1fy2smmw4dwv4x1adkj0rfb-libcufft-11.3.3.83-dev\;/nix/store/4b9rdinnksj1856siw3qmwi9f10480ii-cuda_nvrtc-12.8.93-static\;/nix/store/qh7zggir1ikzh3kvkhi2mqzpyisl4153-libcurand-10.3.9.90-static\;/nix/store/n25l4gcpw8cry4rg2a4c9jw3f53i65zd-libcusolver-11.7.3.90-lib\;/nix/store/xh73kc8spwfvd6w6wc63pyq3zm6qlrja-cuda_nvml_dev-12.8.90\;/nix/store/bgiqy1z8588hgcdzyh9brhc015w3nii0-libcurand-10.3.9.90-lib\;/nix/store/5pvax5f2dg278j43b4llkdxim9y0bjaf-cuda_nvml_dev-12.8.90-dev\;/nix/store/7lf23alvk7yh64flf2mj6smx66sqyz9d-libcufile-1.13.1.3-lib\;/nix/store/lfqj2ni7r0ir3n840b8r1lh63mnqr0ar-libcusparse-12.5.8.93-static\;/nix/store/qmw5pq21avnfvsk657k0zr4nsgwxa4jm-cuda_cudart-12.8.90-static\;/nix/store/826d39r2b4gwafqsyhvzq2bmqv8ygzrd-cuda_profiler_api-12.8.90\;/nix/store/g52lygjflrsyr6wahpf0rvs3fpna3wq9-cudnn-9.11.0.98-lib\;/nix/store/gxw5c9f7q2f1pmy0g1zyblb8p2p891a4-libcufft-11.3.3.83-lib\;/nix/store/pbsi8w1in7q44z83ndqsaxyzfrr2frgh-cuda_nvrtc-12.8.93\;/nix/store/mps4gsnyk6s676zadvcykjxn08yghk5a-libcufile-1.13.1.3-dev\;/nix/store/mvfnbb1m20fkv2n0j69ky9s9afn8p7h1-libcufft-11.3.3.83-static\;/nix/store/8byjxgnvhcyav2283wcxp752d8280c36-libcusolver-11.7.3.90-static\;/nix/store/gz9xyhflw755r8fcxkc816fp54sj0hl4-cuda_cupti-12.8.90-dev\;/nix/store/jyd8jp3q1d408n8842rb8g6ziviwm7q1-cuda_cupti-12.8.90-static\;/nix/store/qa4d2v0lsm6giyr4b4421qsdygz0y
rrh-libcublas-12.8.4.1-dev -DPROTOC_EXE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DProtobuf_PROTOC_EXE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DProtobuf_PROTOC_EXECUTABLE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DPYBIND11_PYTHONLIBS_OVERWRITE=OFF -DPYTHON_EXECUTABLE=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/bin/python3.13 -DPYTHON_INCLUDE_DIR=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/include/python3.13 -DPYTHON_SITE_PACKAGES=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/lib/python3.13/site-packages +sage_attention-torch-ext> cmake flags: -GNinja -DCMAKE_FIND_USE_SYSTEM_PACKAGE_REGISTRY=OFF -DCMAKE_FIND_USE_PACKAGE_REGISTRY=OFF -DCMAKE_EXPORT_NO_PACKAGE_REGISTRY=ON -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTING=OFF -DCMAKE_INSTALL_LOCALEDIR=/nix/store/x0vcv18dr2mcj5ih9i3aq3nshydimpca-sage_attention-torch-ext/share/locale -DCMAKE_INSTALL_LIBEXECDIR=/nix/store/x0vcv18dr2mcj5ih9i3aq3nshydimpca-sage_attention-torch-ext/libexec -DCMAKE_INSTALL_LIBDIR=/nix/store/x0vcv18dr2mcj5ih9i3aq3nshydimpca-sage_attention-torch-ext/lib -DCMAKE_INSTALL_DOCDIR=/nix/store/x0vcv18dr2mcj5ih9i3aq3nshydimpca-sage_attention-torch-ext/share/doc/sage_attention -DCMAKE_INSTALL_INFODIR=/nix/store/x0vcv18dr2mcj5ih9i3aq3nshydimpca-sage_attention-torch-ext/share/info -DCMAKE_INSTALL_MANDIR=/nix/store/x0vcv18dr2mcj5ih9i3aq3nshydimpca-sage_attention-torch-ext/share/man -DCMAKE_INSTALL_INCLUDEDIR=/nix/store/x0vcv18dr2mcj5ih9i3aq3nshydimpca-sage_attention-torch-ext/include -DCMAKE_INSTALL_SBINDIR=/nix/store/x0vcv18dr2mcj5ih9i3aq3nshydimpca-sage_attention-torch-ext/sbin -DCMAKE_INSTALL_BINDIR=/nix/store/x0vcv18dr2mcj5ih9i3aq3nshydimpca-sage_attention-torch-ext/bin -DCMAKE_INSTALL_NAME_DIR=/nix/store/x0vcv18dr2mcj5ih9i3aq3nshydimpca-sage_attention-torch-ext/lib -DCMAKE_POLICY_DEFAULT_CMP0025=NEW -DCMAKE_FIND_FRAMEWORK=LAST -DCMAKE_STRIP=/nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/strip -DCMAKE_RANLIB=/nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/ranlib -DCMAKE_AR=/nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/ar -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DCMAKE_INSTALL_PREFIX=/nix/store/x0vcv18dr2mcj5ih9i3aq3nshydimpca-sage_attention-torch-ext -DPython_EXECUTABLE:STRING=/nix/store/wirj6dihrpcch7idfd7jy4l0hqfsgkk1-python3-3.13.6-env/bin/python -DCMAKE_CUDA_HOST_COMPILER:STRING=/nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/g++ -DNVCC_THREADS=3 
-DCUDAToolkit_INCLUDE_DIR=/nix/store/2dc9bgppqvyd6bd5m4j9zphiyhhd39lv-libcurand-10.3.9.90-dev/include\;/nix/store/x6d389mfn7v413ia2had715g7rdgghgm-cuda_nvrtc-12.8.93-dev/include\;/nix/store/4sz65s9xk80q9jij0i4zbp9xd1pmr3ja-libcusparse-12.5.8.93-dev/include\;/nix/store/11bshw90q985bpd9ds649qmgg0x54q7x-cudnn-9.11.0.98-dev/include\;/nix/store/8dwjdyr7y3dkqlgswpn9swz884lx62gf-cuda_cccl-12.8.90-dev/include\;/nix/store/4cq7zkla3djm6g5gkpzzx4gfikda2k7z-cuda_profiler_api-12.8.90-dev/include\;/nix/store/90nghg4zsrw6gki8y8hw4id3p31bc8rk-libcusolver-11.7.3.90-dev/include\;/nix/store/vg32acb8vlqyhkhabbgvmralfw0kwhi3-cuda_cudart-12.8.90-dev/include\;/nix/store/vqg4r8izl1fy2smmw4dwv4x1adkj0rfb-libcufft-11.3.3.83-dev/include\;/nix/store/8kyv8ffbfvksnqmm1kaz0llysg7dpn9z-cuda_nvcc-12.8.93/include\;/nix/store/5pvax5f2dg278j43b4llkdxim9y0bjaf-cuda_nvml_dev-12.8.90-dev/include\;/nix/store/klis291y7cza60yzgkxzbid80bnyshmr-cuda_nvtx-12.8.90-dev/include\;/nix/store/mps4gsnyk6s676zadvcykjxn08yghk5a-libcufile-1.13.1.3-dev/include\;/nix/store/gz9xyhflw755r8fcxkc816fp54sj0hl4-cuda_cupti-12.8.90-dev/include\;/nix/store/qa4d2v0lsm6giyr4b4421qsdygz0yrrh-libcublas-12.8.4.1-dev/include -DCUDAToolkit_ROOT=/nix/store/8kyv8ffbfvksnqmm1kaz0llysg7dpn9z-cuda_nvcc-12.8.93\;/nix/store/w96jlfiy431jnsww1x3ak3chhssa3i2s-libcusparse-12.5.8.93\;/nix/store/6zj6v3b9v8xdjs94iq1228slqwr757ij-libcublas-12.8.4.1\;/nix/store/q85pndpvaqdznfijmkn0mlfp8y3v08dl-cuda_cccl-12.8.90\;/nix/store/2dc9bgppqvyd6bd5m4j9zphiyhhd39lv-libcurand-10.3.9.90-dev\;/nix/store/cwy7010iwla9b2v1fx82sp66v12r913x-libcublas-12.8.4.1-lib\;/nix/store/x6d389mfn7v413ia2had715g7rdgghgm-cuda_nvrtc-12.8.93-dev\;/nix/store/22n25ss46s0hgspdp26qk025w9m393cd-libcublas-12.8.4.1-static\;/nix/store/sc5wnfvmk0j73xdppxj25kgk8s98lscs-cuda_nvrtc-12.8.93-lib\;/nix/store/54wqrrh6qbrwmv2wkz6b216ljrqbhcji-cudnn-9.11.0.98\;/nix/store/4sz65s9xk80q9jij0i4zbp9xd1pmr3ja-libcusparse-12.5.8.93-dev\;/nix/store/11bshw90q985bpd9ds649qmgg0x54q7x-cudnn-9.11.0.98-dev\;/nix/store/8dwjdyr7y3dkqlgswpn9swz884lx62gf-cuda_cccl-12.8.90-dev\;/nix/store/1v8m3gdw08hnbs7qa4jbkflm9lg1r5q6-libcurand-10.3.9.90\;/nix/store/jc58pv1cxhvpblrnzgaai60x04q6m0bp-cuda_nvml_dev-12.8.90-lib\;/nix/store/khwhv5d4kmzjpsm785iz3sva6i9sj9r5-libcufile-1.13.1.3-static\;/nix/store/xv6c2jcc3adyqks2xl28p4r0q1g4bc92-cuda_cupti-12.8.90\;/nix/store/a2h2yfjfx0si8smnqmghw7ccj0qbnv81-cuda_cupti-12.8.90-lib\;/nix/store/4cq7zkla3djm6g5gkpzzx4gfikda2k7z-cuda_profiler_api-12.8.90-dev\;/nix/store/5f6dvklv5d0mvygrrf0vzp0smcn7kk01-cuda_nvtx-12.8.90\;/nix/store/xccbzbpcn8r506zdvhvbkqkilhlrh3c5-cuda_cudart-12.8.90-lib\;/nix/store/acbir62i1d7kvka4plmxsq8442z7r1l2-cuda_cudart-12.8.90-stubs\;/nix/store/ckkcbggf4x93zg3xn9xr00jgxs2x5p21-cuda_nvml_dev-12.8.90-static\;/nix/store/ml3bkm8bz1lnjmfd8lyxbjqpi1llasr2-libcusolver-11.7.3.90\;/nix/store/9zlrjnq7lisarny3llszk131vy816x2w-libcufile-1.13.1.3\;/nix/store/90nghg4zsrw6gki8y8hw4id3p31bc8rk-libcusolver-11.7.3.90-dev\;/nix/store/vg32acb8vlqyhkhabbgvmralfw0kwhi3-cuda_cudart-12.8.90-dev\;/nix/store/y27d2s3rcw8d17wcw23glhlj5rhs8d6y-cuda_cudart-12.8.90\;/nix/store/wa9pr3485k3mw8jhv7i9kfzjrqmdl5bb-cuda_nvtx-12.8.90-lib\;/nix/store/n96pib9yj31n031dmrrx43m61js1r5rn-cuda_nvcc-12.8.93-static\;/nix/store/pabakly3280dnghh3i89wklfm61raf7z-cuda_cupti-12.8.90-sample\;/nix/store/l0jiwp1f0dhigd41qqf408c5qyabz2vd-cudnn-9.11.0.98-static\;/nix/store/95lzbxp68m127n6hyllbr3dh2mlj7y8m-libcufft-11.3.3.83\;/nix/store/lxsd5l6hnqcfgqc1nsn8mmmpx385m3k8-libcusparse-12.5.8.93-lib\;/nix/store/vqg4r8izl1fy2smmw4dwv4x1adkj0rfb-libcufft-11.3.3.
83-dev\;/nix/store/4b9rdinnksj1856siw3qmwi9f10480ii-cuda_nvrtc-12.8.93-static\;/nix/store/qh7zggir1ikzh3kvkhi2mqzpyisl4153-libcurand-10.3.9.90-static\;/nix/store/n25l4gcpw8cry4rg2a4c9jw3f53i65zd-libcusolver-11.7.3.90-lib\;/nix/store/xh73kc8spwfvd6w6wc63pyq3zm6qlrja-cuda_nvml_dev-12.8.90\;/nix/store/bgiqy1z8588hgcdzyh9brhc015w3nii0-libcurand-10.3.9.90-lib\;/nix/store/5pvax5f2dg278j43b4llkdxim9y0bjaf-cuda_nvml_dev-12.8.90-dev\;/nix/store/7lf23alvk7yh64flf2mj6smx66sqyz9d-libcufile-1.13.1.3-lib\;/nix/store/klis291y7cza60yzgkxzbid80bnyshmr-cuda_nvtx-12.8.90-dev\;/nix/store/lfqj2ni7r0ir3n840b8r1lh63mnqr0ar-libcusparse-12.5.8.93-static\;/nix/store/qmw5pq21avnfvsk657k0zr4nsgwxa4jm-cuda_cudart-12.8.90-static\;/nix/store/826d39r2b4gwafqsyhvzq2bmqv8ygzrd-cuda_profiler_api-12.8.90\;/nix/store/g52lygjflrsyr6wahpf0rvs3fpna3wq9-cudnn-9.11.0.98-lib\;/nix/store/gxw5c9f7q2f1pmy0g1zyblb8p2p891a4-libcufft-11.3.3.83-lib\;/nix/store/pbsi8w1in7q44z83ndqsaxyzfrr2frgh-cuda_nvrtc-12.8.93\;/nix/store/mps4gsnyk6s676zadvcykjxn08yghk5a-libcufile-1.13.1.3-dev\;/nix/store/mvfnbb1m20fkv2n0j69ky9s9afn8p7h1-libcufft-11.3.3.83-static\;/nix/store/8byjxgnvhcyav2283wcxp752d8280c36-libcusolver-11.7.3.90-static\;/nix/store/gz9xyhflw755r8fcxkc816fp54sj0hl4-cuda_cupti-12.8.90-dev\;/nix/store/jyd8jp3q1d408n8842rb8g6ziviwm7q1-cuda_cupti-12.8.90-static\;/nix/store/qa4d2v0lsm6giyr4b4421qsdygz0yrrh-libcublas-12.8.4.1-dev -DPROTOC_EXE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DProtobuf_PROTOC_EXE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DProtobuf_PROTOC_EXECUTABLE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DPYBIND11_PYTHONLIBS_OVERWRITE=OFF -DPYTHON_EXECUTABLE=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/bin/python3.13 -DPYTHON_INCLUDE_DIR=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/include/python3.13 -DPYTHON_SITE_PACKAGES=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/lib/python3.13/site-packages +sage_attention-torch-ext> -- The CXX compiler identification is GNU 13.4.0 +sage_attention-torch-ext> -- Detecting CXX compiler ABI info +sage_attention-torch-ext> -- The CXX compiler identification is GNU 13.4.0 +sage_attention-torch-ext> -- Detecting CXX compiler ABI info +sage_attention-torch-ext> -- The CXX compiler identification is GNU 14.3.0 +sage_attention-torch-ext> -- The CXX compiler identification is GNU 14.3.0 +sage_attention-torch-ext> -- Detecting CXX compiler ABI info +sage_attention-torch-ext> -- Detecting CXX compiler ABI info +sage_attention-torch-ext> -- The CXX compiler identification is GNU 14.3.0 +sage_attention-torch-ext> -- Detecting CXX compiler ABI info +sage_attention-torch-ext> -- Detecting CXX compiler ABI info - done +sage_attention-torch-ext> -- Detecting CXX compiler ABI info - done +sage_attention-torch-ext> -- Detecting CXX compiler ABI info - done +sage_attention-torch-ext> -- Detecting CXX compiler ABI info - done +sage_attention-torch-ext> -- Check for working CXX compiler: /nix/store/rgfv9lch0b6ksjzlzsx0mljsb0ypqr8x-gcc-wrapper-13.4.0/bin/g++ - skipped +sage_attention-torch-ext> -- Detecting CXX compile features +sage_attention-torch-ext> -- Detecting CXX compile features - done +sage_attention-torch-ext> -- Check for working CXX compiler: /nix/store/rgfv9lch0b6ksjzlzsx0mljsb0ypqr8x-gcc-wrapper-13.4.0/bin/g++ - skipped +sage_attention-torch-ext> -- Detecting CXX compile features +sage_attention-torch-ext> -- Detecting CXX compile features - done +sage_attention-torch-ext> -- 
Detecting CXX compiler ABI info - done +sage_attention-torch-ext> -- Check for working CXX compiler: /nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/g++ - skipped +sage_attention-torch-ext> -- Detecting CXX compile features +sage_attention-torch-ext> -- Detecting CXX compile features - done +sage_attention-torch-ext> -- Check for working CXX compiler: /nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/g++ - skipped +sage_attention-torch-ext> -- Detecting CXX compile features +sage_attention-torch-ext> -- Detecting CXX compile features - done +sage_attention-torch-ext> -- FetchContent base directory: /build/source/build/_deps +sage_attention-torch-ext> -- Check for working CXX compiler: /nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/g++ - skipped +sage_attention-torch-ext> -- Detecting CXX compile features +sage_attention-torch-ext> -- Detecting CXX compile features - done +sage_attention-torch-ext> -- FetchContent base directory: /build/source/build/_deps +sage_attention-torch-ext> -- FetchContent base directory: /build/source/build/_deps +sage_attention-torch-ext> -- FetchContent base directory: /build/source/build/_deps +sage_attention-torch-ext> -- FetchContent base directory: /build/source/build/_deps +sage_attention-torch-ext> -- Found Python: /nix/store/j6r6hpjs8p5m4s3i8cqqavg62fd5z48g-python3-3.13.6-env/bin/python (found version "3.13.6") found components: Development Development.SABIModule Interpreter Development.Module Development.Embed +sage_attention-torch-ext> -- Found Python: /nix/store/r3gwdvvsgl1csl12f4pkhz0jhsch7bdy-python3-3.13.6-env/bin/python (found version "3.13.6") found components: Development Development.SABIModule Interpreter Development.Module Development.Embed +sage_attention-torch-ext> -- Found Python: /nix/store/aikr517kmcd8r2nrrj70jq71d7352qiq-python3-3.13.6-env/bin/python (found version "3.13.6") found components: Development Development.SABIModule Interpreter Development.Module Development.Embed +sage_attention-torch-ext> -- Found Python: /nix/store/qal2apcjwlw2p2kk05dwqdgzh8ml687l-python3-3.13.6-env/bin/python (found version "3.13.6") found components: Development Development.SABIModule Interpreter Development.Module Development.Embed +sage_attention-torch-ext> -- Found Python: /nix/store/wirj6dihrpcch7idfd7jy4l0hqfsgkk1-python3-3.13.6-env/bin/python (found version "3.13.6") found components: Development Development.SABIModule Interpreter Development.Module Development.Embed +sage_attention-torch-ext> -- Found CUDA: /nix/store/7iw4ipbdy17yzvqjhxpw03i17kq7f7rj-cuda_nvcc-12.6.85 (found version "12.6") +sage_attention-torch-ext> -- Found CUDA: /nix/store/8zrv6h6f2cfz34pwq012n4cx2zrv5m1s-cuda_nvcc-12.9.86 (found version "12.9") +sage_attention-torch-ext> -- Found CUDA: /nix/store/7iw4ipbdy17yzvqjhxpw03i17kq7f7rj-cuda_nvcc-12.6.85 (found version "12.6") +sage_attention-torch-ext> -- Found CUDA: /nix/store/8kyv8ffbfvksnqmm1kaz0llysg7dpn9z-cuda_nvcc-12.8.93 (found version "12.8") +sage_attention-torch-ext> -- Found CUDA: /nix/store/8kyv8ffbfvksnqmm1kaz0llysg7dpn9z-cuda_nvcc-12.8.93 (found version "12.8") +sage_attention-torch-ext> -- The CUDA compiler identification is NVIDIA 12.6.85 with host compiler GNU 13.4.0 +sage_attention-torch-ext> -- Detecting CUDA compiler ABI info +sage_attention-torch-ext> -- The CUDA compiler identification is NVIDIA 12.9.86 with host compiler GNU 14.3.0 +sage_attention-torch-ext> -- Detecting CUDA compiler ABI info +sage_attention-torch-ext> -- The CUDA compiler identification is 
NVIDIA 12.6.85 with host compiler GNU 13.4.0 +sage_attention-torch-ext> -- Detecting CUDA compiler ABI info +sage_attention-torch-ext> -- The CUDA compiler identification is NVIDIA 12.8.93 with host compiler GNU 14.3.0 +sage_attention-torch-ext> -- Detecting CUDA compiler ABI info +sage_attention-torch-ext> -- The CUDA compiler identification is NVIDIA 12.8.93 with host compiler GNU 14.3.0 +sage_attention-torch-ext> -- Detecting CUDA compiler ABI info +sage_attention-torch-ext> -- Detecting CUDA compiler ABI info - done +sage_attention-torch-ext> -- Detecting CUDA compiler ABI info - done +sage_attention-torch-ext> -- Check for working CUDA compiler: /nix/store/7iw4ipbdy17yzvqjhxpw03i17kq7f7rj-cuda_nvcc-12.6.85/bin/nvcc - skipped +sage_attention-torch-ext> -- Detecting CUDA compile features +sage_attention-torch-ext> -- Detecting CUDA compile features - done +sage_attention-torch-ext> -- Detecting CUDA compiler ABI info - done +sage_attention-torch-ext> -- Found CUDAToolkit: /nix/store/7iw4ipbdy17yzvqjhxpw03i17kq7f7rj-cuda_nvcc-12.6.85/include;/nix/store/fy71fffqbwg3xgvygn66kd4igj65gblv-libcublas-12.6.4.1-dev/include (found version "12.6.85") +sage_attention-torch-ext> -- Performing Test CMAKE_HAVE_LIBC_PTHREAD +sage_attention-torch-ext> -- Check for working CUDA compiler: /nix/store/8zrv6h6f2cfz34pwq012n4cx2zrv5m1s-cuda_nvcc-12.9.86/bin/nvcc - skipped +sage_attention-torch-ext> -- Detecting CUDA compile features +sage_attention-torch-ext> -- Detecting CUDA compile features - done +sage_attention-torch-ext> -- Detecting CUDA compiler ABI info - done +sage_attention-torch-ext> -- Found CUDAToolkit: /nix/store/8zrv6h6f2cfz34pwq012n4cx2zrv5m1s-cuda_nvcc-12.9.86/include;/nix/store/dd8wl3nnsigw2gj5bwaiswla97jpw1jz-libcublas-12.9.1.4-dev/include (found version "12.9.86") +sage_attention-torch-ext> -- Performing Test CMAKE_HAVE_LIBC_PTHREAD +sage_attention-torch-ext> -- Check for working CUDA compiler: /nix/store/7iw4ipbdy17yzvqjhxpw03i17kq7f7rj-cuda_nvcc-12.6.85/bin/nvcc - skipped +sage_attention-torch-ext> -- Detecting CUDA compile features +sage_attention-torch-ext> -- Detecting CUDA compile features - done +sage_attention-torch-ext> -- Detecting CUDA compiler ABI info - done +sage_attention-torch-ext> -- Check for working CUDA compiler: /nix/store/8kyv8ffbfvksnqmm1kaz0llysg7dpn9z-cuda_nvcc-12.8.93/bin/nvcc - skipped +sage_attention-torch-ext> -- Found CUDAToolkit: /nix/store/7iw4ipbdy17yzvqjhxpw03i17kq7f7rj-cuda_nvcc-12.6.85/include;/nix/store/fy71fffqbwg3xgvygn66kd4igj65gblv-libcublas-12.6.4.1-dev/include (found version "12.6.85") +sage_attention-torch-ext> -- Performing Test CMAKE_HAVE_LIBC_PTHREAD +sage_attention-torch-ext> -- Detecting CUDA compile features +sage_attention-torch-ext> -- Detecting CUDA compile features - done +sage_attention-torch-ext> -- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Failed +sage_attention-torch-ext> -- Looking for pthread_create in pthreads +sage_attention-torch-ext> -- Found CUDAToolkit: /nix/store/8kyv8ffbfvksnqmm1kaz0llysg7dpn9z-cuda_nvcc-12.8.93/include;/nix/store/qa4d2v0lsm6giyr4b4421qsdygz0yrrh-libcublas-12.8.4.1-dev/include (found version "12.8.93") +sage_attention-torch-ext> -- Performing Test CMAKE_HAVE_LIBC_PTHREAD +sage_attention-torch-ext> -- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Failed +sage_attention-torch-ext> -- Looking for pthread_create in pthreads +sage_attention-torch-ext> -- Check for working CUDA compiler: /nix/store/8kyv8ffbfvksnqmm1kaz0llysg7dpn9z-cuda_nvcc-12.8.93/bin/nvcc - skipped +sage_attention-torch-ext> -- Detecting CUDA 
compile features +sage_attention-torch-ext> -- Detecting CUDA compile features - done +sage_attention-torch-ext> -- Looking for pthread_create in pthreads - not found +sage_attention-torch-ext> -- Looking for pthread_create in pthread +sage_attention-torch-ext> -- Found CUDAToolkit: /nix/store/8kyv8ffbfvksnqmm1kaz0llysg7dpn9z-cuda_nvcc-12.8.93/include;/nix/store/qa4d2v0lsm6giyr4b4421qsdygz0yrrh-libcublas-12.8.4.1-dev/include (found version "12.8.93") +sage_attention-torch-ext> -- Performing Test CMAKE_HAVE_LIBC_PTHREAD +sage_attention-torch-ext> -- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Failed +sage_attention-torch-ext> -- Looking for pthread_create in pthreads +sage_attention-torch-ext> -- Looking for pthread_create in pthreads - not found +sage_attention-torch-ext> -- Looking for pthread_create in pthread +sage_attention-torch-ext> -- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Failed +sage_attention-torch-ext> -- Looking for pthread_create in pthreads +sage_attention-torch-ext> -- Looking for pthread_create in pthread - found +sage_attention-torch-ext> -- Found Threads: TRUE +sage_attention-torch-ext> -- Looking for pthread_create in pthreads - not found +sage_attention-torch-ext> -- Looking for pthread_create in pthread +sage_attention-torch-ext> -- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Failed +sage_attention-torch-ext> -- Looking for pthread_create in pthreads +sage_attention-torch-ext> -- Looking for pthread_create in pthreads - not found +sage_attention-torch-ext> -- Looking for pthread_create in pthread +sage_attention-torch-ext> -- Looking for pthread_create in pthread - found +sage_attention-torch-ext> -- Found Threads: TRUE +sage_attention-torch-ext> -- Looking for pthread_create in pthread - found +sage_attention-torch-ext> -- Found Threads: TRUE +sage_attention-torch-ext> -- Looking for pthread_create in pthreads - not found +sage_attention-torch-ext> -- Looking for pthread_create in pthread +sage_attention-torch-ext> -- Looking for pthread_create in pthread - found +sage_attention-torch-ext> -- Found Threads: TRUE +sage_attention-torch-ext> -- Looking for pthread_create in pthread - found +sage_attention-torch-ext> -- Found Threads: TRUE +sage_attention-torch-ext> -- PyTorch: CUDA detected: 12.6 +sage_attention-torch-ext> -- PyTorch: CUDA nvcc is: /nix/store/7iw4ipbdy17yzvqjhxpw03i17kq7f7rj-cuda_nvcc-12.6.85/bin/nvcc +sage_attention-torch-ext> -- PyTorch: CUDA toolkit directory: /nix/store/7iw4ipbdy17yzvqjhxpw03i17kq7f7rj-cuda_nvcc-12.6.85 +sage_attention-torch-ext> -- PyTorch: CUDA detected: 12.9 +sage_attention-torch-ext> -- PyTorch: CUDA nvcc is: /nix/store/8zrv6h6f2cfz34pwq012n4cx2zrv5m1s-cuda_nvcc-12.9.86/bin/nvcc +sage_attention-torch-ext> -- PyTorch: CUDA toolkit directory: /nix/store/8zrv6h6f2cfz34pwq012n4cx2zrv5m1s-cuda_nvcc-12.9.86 +sage_attention-torch-ext> -- PyTorch: CUDA detected: 12.6 +sage_attention-torch-ext> -- PyTorch: CUDA nvcc is: /nix/store/7iw4ipbdy17yzvqjhxpw03i17kq7f7rj-cuda_nvcc-12.6.85/bin/nvcc +sage_attention-torch-ext> -- PyTorch: CUDA toolkit directory: /nix/store/7iw4ipbdy17yzvqjhxpw03i17kq7f7rj-cuda_nvcc-12.6.85 +sage_attention-torch-ext> -- PyTorch: CUDA detected: 12.8 +sage_attention-torch-ext> -- PyTorch: CUDA nvcc is: /nix/store/8kyv8ffbfvksnqmm1kaz0llysg7dpn9z-cuda_nvcc-12.8.93/bin/nvcc +sage_attention-torch-ext> -- PyTorch: CUDA toolkit directory: /nix/store/8kyv8ffbfvksnqmm1kaz0llysg7dpn9z-cuda_nvcc-12.8.93 +sage_attention-torch-ext> -- PyTorch: Header version is: 12.6 +sage_attention-torch-ext> -- Found Python: 
/nix/store/r3gwdvvsgl1csl12f4pkhz0jhsch7bdy-python3-3.13.6-env/bin/python (found version "3.13.6") found components: Interpreter +sage_attention-torch-ext> CMake Warning at /nix/store/ld6fk094jhhsnbip1406vrky9lmyxbax-python3.13-torch-2.8.0/lib/python3.13/site-packages/torch/share/cmake/Caffe2/public/cuda.cmake:146 (message): +sage_attention-torch-ext> Failed to compute shorthash for libnvrtc.so +sage_attention-torch-ext> Call Stack (most recent call first): +sage_attention-torch-ext> /nix/store/ld6fk094jhhsnbip1406vrky9lmyxbax-python3.13-torch-2.8.0/lib/python3.13/site-packages/torch/share/cmake/Caffe2/Caffe2Config.cmake:86 (include) +sage_attention-torch-ext> /nix/store/ld6fk094jhhsnbip1406vrky9lmyxbax-python3.13-torch-2.8.0/lib/python3.13/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:68 (find_package) +sage_attention-torch-ext> CMakeLists.txt:30 (find_package) +sage_attention-torch-ext> +sage_attention-torch-ext> +sage_attention-torch-ext> -- USE_CUDNN is set to 0. Compiling without cuDNN support +sage_attention-torch-ext> -- USE_CUSPARSELT is set to 0. Compiling without cuSPARSELt support +sage_attention-torch-ext> -- USE_CUDSS is set to 0. Compiling without cuDSS support +sage_attention-torch-ext> -- USE_CUFILE is set to 0. Compiling without cuFile support +sage_attention-torch-ext> -- Added CUDA NVCC flags for: -gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_89,code=sm_89;-gencode;arch=compute_90,code=sm_90 +sage_attention-torch-ext> CMake Warning at /nix/store/ld6fk094jhhsnbip1406vrky9lmyxbax-python3.13-torch-2.8.0/lib/python3.13/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:22 (message): +sage_attention-torch-ext> static library kineto_LIBRARY-NOTFOUND not found. 
+sage_attention-torch-ext> Call Stack (most recent call first): +sage_attention-torch-ext> /nix/store/ld6fk094jhhsnbip1406vrky9lmyxbax-python3.13-torch-2.8.0/lib/python3.13/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:125 (append_torchlib_if_found) +sage_attention-torch-ext> CMakeLists.txt:30 (find_package) +sage_attention-torch-ext> +sage_attention-torch-ext> +sage_attention-torch-ext> -- Found Torch: /nix/store/pg32mpjmckfs38anjzgyvk2ljfw12pb3-python3.13-torch-2.8.0-lib/lib/libtorch.so +sage_attention-torch-ext> -- CUDA target architectures: 7.0;7.5;8.0;8.6;8.9;9.0 +sage_attention-torch-ext> -- CUDA supported target architectures: 7.0;7.5;8.0;8.6;8.9;9.0 +sage_attention-torch-ext> -- PyTorch: Header version is: 12.9 +sage_attention-torch-ext> -- PyTorch: CUDA detected: 12.8 +sage_attention-torch-ext> -- PyTorch: CUDA nvcc is: /nix/store/8kyv8ffbfvksnqmm1kaz0llysg7dpn9z-cuda_nvcc-12.8.93/bin/nvcc +sage_attention-torch-ext> -- PyTorch: CUDA toolkit directory: /nix/store/8kyv8ffbfvksnqmm1kaz0llysg7dpn9z-cuda_nvcc-12.8.93 +sage_attention-torch-ext> -- Found Python: /nix/store/aikr517kmcd8r2nrrj70jq71d7352qiq-python3-3.13.6-env/bin/python (found version "3.13.6") found components: Interpreter +sage_attention-torch-ext> CMake Warning at /nix/store/483ma0klnbln74izv5jiyila52bfwqxh-python3.13-torch-2.8.0/lib/python3.13/site-packages/torch/share/cmake/Caffe2/public/cuda.cmake:146 (message): +sage_attention-torch-ext> Failed to compute shorthash for libnvrtc.so +sage_attention-torch-ext> Call Stack (most recent call first): +sage_attention-torch-ext> /nix/store/483ma0klnbln74izv5jiyila52bfwqxh-python3.13-torch-2.8.0/lib/python3.13/site-packages/torch/share/cmake/Caffe2/Caffe2Config.cmake:86 (include) +sage_attention-torch-ext> /nix/store/483ma0klnbln74izv5jiyila52bfwqxh-python3.13-torch-2.8.0/lib/python3.13/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:68 (find_package) +sage_attention-torch-ext> CMakeLists.txt:30 (find_package) +sage_attention-torch-ext> +sage_attention-torch-ext> +sage_attention-torch-ext> -- USE_CUDNN is set to 0. Compiling without cuDNN support +sage_attention-torch-ext> -- USE_CUSPARSELT is set to 0. Compiling without cuSPARSELt support +sage_attention-torch-ext> -- USE_CUDSS is set to 0. Compiling without cuDSS support +sage_attention-torch-ext> -- USE_CUFILE is set to 0. Compiling without cuFile support +sage_attention-torch-ext> -- Added CUDA NVCC flags for: -gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_89,code=sm_89;-gencode;arch=compute_90,code=sm_90;-gencode;arch=compute_100,code=sm_100;-gencode;arch=compute_101,code=sm_101;-gencode;arch=compute_120,code=sm_120 +sage_attention-torch-ext> -- PyTorch: Header version is: 12.6 +sage_attention-torch-ext> CMake Warning at /nix/store/483ma0klnbln74izv5jiyila52bfwqxh-python3.13-torch-2.8.0/lib/python3.13/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:22 (message): +sage_attention-torch-ext> static library kineto_LIBRARY-NOTFOUND not found. 
+sage_attention-torch-ext> Call Stack (most recent call first): +sage_attention-torch-ext> /nix/store/483ma0klnbln74izv5jiyila52bfwqxh-python3.13-torch-2.8.0/lib/python3.13/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:125 (append_torchlib_if_found) +sage_attention-torch-ext> CMakeLists.txt:30 (find_package) +sage_attention-torch-ext> +sage_attention-torch-ext> +sage_attention-torch-ext> -- Found Torch: /nix/store/zccgvlbr93bhyia3sr9f2mddmkp2jyx7-python3.13-torch-2.8.0-lib/lib/libtorch.so +sage_attention-torch-ext> -- CUDA target architectures: 7.0;7.5;8.0;8.6;8.9;9.0;10.0;10.1;12.0 +sage_attention-torch-ext> -- CUDA supported target architectures: 7.0;7.5;8.0;8.6;8.9;9.0;10.0;10.1;12.0 +sage_attention-torch-ext> -- PyTorch: Header version is: 12.8 +sage_attention-torch-ext> -- Found Python: /nix/store/j6r6hpjs8p5m4s3i8cqqavg62fd5z48g-python3-3.13.6-env/bin/python (found version "3.13.6") found components: Interpreter +sage_attention-torch-ext> CMake Warning at /nix/store/dzz5brlw0xzs9hp3v8fvvwcvkmsr3ls9-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Caffe2/public/cuda.cmake:146 (message): +sage_attention-torch-ext> Failed to compute shorthash for libnvrtc.so +sage_attention-torch-ext> Call Stack (most recent call first): +sage_attention-torch-ext> /nix/store/dzz5brlw0xzs9hp3v8fvvwcvkmsr3ls9-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Caffe2/Caffe2Config.cmake:86 (include) +sage_attention-torch-ext> /nix/store/dzz5brlw0xzs9hp3v8fvvwcvkmsr3ls9-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:68 (find_package) +sage_attention-torch-ext> CMakeLists.txt:30 (find_package) +sage_attention-torch-ext> +sage_attention-torch-ext> +sage_attention-torch-ext> CMake Warning (dev) at /nix/store/0vnarm4qjnj16dr3zj9kwq6bn79c0icn-cmake-3.31.7/share/cmake-3.31/Modules/FindPackageHandleStandardArgs.cmake:441 (message): +sage_attention-torch-ext> The package name passed to `find_package_handle_standard_args` (nvtx3) does +sage_attention-torch-ext> not match the name of the calling package (Caffe2). This can lead to +sage_attention-torch-ext> problems in calling code that expects `find_package` result variables +sage_attention-torch-ext> (e.g., `_FOUND`) to follow a certain pattern. +sage_attention-torch-ext> Call Stack (most recent call first): +sage_attention-torch-ext> /nix/store/dzz5brlw0xzs9hp3v8fvvwcvkmsr3ls9-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Caffe2/public/cuda.cmake:184 (find_package_handle_standard_args) +sage_attention-torch-ext> /nix/store/dzz5brlw0xzs9hp3v8fvvwcvkmsr3ls9-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Caffe2/Caffe2Config.cmake:86 (include) +sage_attention-torch-ext> /nix/store/dzz5brlw0xzs9hp3v8fvvwcvkmsr3ls9-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:68 (find_package) +sage_attention-torch-ext> CMakeLists.txt:30 (find_package) +sage_attention-torch-ext> This warning is for project developers. Use -Wno-dev to suppress it. 
+sage_attention-torch-ext> +sage_attention-torch-ext> -- Could NOT find nvtx3 (missing: nvtx3_dir) +sage_attention-torch-ext> CMake Warning at /nix/store/dzz5brlw0xzs9hp3v8fvvwcvkmsr3ls9-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Caffe2/public/cuda.cmake:190 (message): +sage_attention-torch-ext> Cannot find NVTX3, find old NVTX instead +sage_attention-torch-ext> Call Stack (most recent call first): +sage_attention-torch-ext> /nix/store/dzz5brlw0xzs9hp3v8fvvwcvkmsr3ls9-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Caffe2/Caffe2Config.cmake:86 (include) +sage_attention-torch-ext> /nix/store/dzz5brlw0xzs9hp3v8fvvwcvkmsr3ls9-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:68 (find_package) +sage_attention-torch-ext> CMakeLists.txt:30 (find_package) +sage_attention-torch-ext> +sage_attention-torch-ext> +sage_attention-torch-ext> -- USE_CUDNN is set to 0. Compiling without cuDNN support +sage_attention-torch-ext> -- USE_CUSPARSELT is set to 0. Compiling without cuSPARSELt support +sage_attention-torch-ext> -- USE_CUDSS is set to 0. Compiling without cuDSS support +sage_attention-torch-ext> -- USE_CUFILE is set to 0. Compiling without cuFile support +sage_attention-torch-ext> CMake Warning at /nix/store/dzz5brlw0xzs9hp3v8fvvwcvkmsr3ls9-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Caffe2/public/utils.cmake:328 (message): +sage_attention-torch-ext> In the future we will require one to explicitly pass TORCH_CUDA_ARCH_LIST +sage_attention-torch-ext> to cmake instead of implicitly setting it as an env variable. This will +sage_attention-torch-ext> become a FATAL_ERROR in future version of pytorch. +sage_attention-torch-ext> Call Stack (most recent call first): +sage_attention-torch-ext> /nix/store/dzz5brlw0xzs9hp3v8fvvwcvkmsr3ls9-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Caffe2/public/cuda.cmake:337 (torch_cuda_get_nvcc_gencode_flag) +sage_attention-torch-ext> /nix/store/dzz5brlw0xzs9hp3v8fvvwcvkmsr3ls9-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Caffe2/Caffe2Config.cmake:86 (include) +sage_attention-torch-ext> /nix/store/dzz5brlw0xzs9hp3v8fvvwcvkmsr3ls9-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:68 (find_package) +sage_attention-torch-ext> CMakeLists.txt:30 (find_package) +sage_attention-torch-ext> +sage_attention-torch-ext> +sage_attention-torch-ext> -- Added CUDA NVCC flags for: -gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_89,code=sm_89;-gencode;arch=compute_90,code=sm_90 +sage_attention-torch-ext> -- Found Python: /nix/store/qal2apcjwlw2p2kk05dwqdgzh8ml687l-python3-3.13.6-env/bin/python (found version "3.13.6") found components: Interpreter +sage_attention-torch-ext> CMake Warning at /nix/store/6drs80sxjhskdki55g5k1dw0jzbd258w-python3.13-torch-2.8.0/lib/python3.13/site-packages/torch/share/cmake/Caffe2/public/cuda.cmake:146 (message): +sage_attention-torch-ext> Failed to compute shorthash for libnvrtc.so +sage_attention-torch-ext> Call Stack (most recent call first): +sage_attention-torch-ext> /nix/store/6drs80sxjhskdki55g5k1dw0jzbd258w-python3.13-torch-2.8.0/lib/python3.13/site-packages/torch/share/cmake/Caffe2/Caffe2Config.cmake:86 (include) +sage_attention-torch-ext> 
/nix/store/6drs80sxjhskdki55g5k1dw0jzbd258w-python3.13-torch-2.8.0/lib/python3.13/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:68 (find_package) +sage_attention-torch-ext> CMakeLists.txt:30 (find_package) +sage_attention-torch-ext> +sage_attention-torch-ext> +sage_attention-torch-ext> -- USE_CUDNN is set to 0. Compiling without cuDNN support +sage_attention-torch-ext> -- USE_CUSPARSELT is set to 0. Compiling without cuSPARSELt support +sage_attention-torch-ext> -- USE_CUDSS is set to 0. Compiling without cuDSS support +sage_attention-torch-ext> -- USE_CUFILE is set to 0. Compiling without cuFile support +sage_attention-torch-ext> -- Added CUDA NVCC flags for: -gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_89,code=sm_89;-gencode;arch=compute_90,code=sm_90;-gencode;arch=compute_100,code=sm_100;-gencode;arch=compute_101,code=sm_101;-gencode;arch=compute_120,code=sm_120 +sage_attention-torch-ext> CMake Warning at /nix/store/dzz5brlw0xzs9hp3v8fvvwcvkmsr3ls9-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:22 (message): +sage_attention-torch-ext> static library kineto_LIBRARY-NOTFOUND not found. +sage_attention-torch-ext> Call Stack (most recent call first): +sage_attention-torch-ext> /nix/store/dzz5brlw0xzs9hp3v8fvvwcvkmsr3ls9-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:125 (append_torchlib_if_found) +sage_attention-torch-ext> CMakeLists.txt:30 (find_package) +sage_attention-torch-ext> +sage_attention-torch-ext> +sage_attention-torch-ext> -- Found Torch: /nix/store/8sicfhvzq84gnxiwybyjgp80pcynamzn-python3.13-torch-2.7.1-lib/lib/libtorch.so +sage_attention-torch-ext> -- CUDA target architectures: 7.0;7.5;8.0;8.6;8.9;9.0 +sage_attention-torch-ext> -- CUDA supported target architectures: 7.0;7.5;8.0;8.6;8.9;9.0 +sage_attention-torch-ext> CMake Warning at /nix/store/6drs80sxjhskdki55g5k1dw0jzbd258w-python3.13-torch-2.8.0/lib/python3.13/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:22 (message): +sage_attention-torch-ext> static library kineto_LIBRARY-NOTFOUND not found. 
+sage_attention-torch-ext> Call Stack (most recent call first): +sage_attention-torch-ext> /nix/store/6drs80sxjhskdki55g5k1dw0jzbd258w-python3.13-torch-2.8.0/lib/python3.13/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:125 (append_torchlib_if_found) +sage_attention-torch-ext> CMakeLists.txt:30 (find_package) +sage_attention-torch-ext> +sage_attention-torch-ext> +sage_attention-torch-ext> -- Found Torch: /nix/store/mrq1wi2biib2p1mks17g8g5sc4fd492r-python3.13-torch-2.8.0-lib/lib/libtorch.so +sage_attention-torch-ext> -- CUDA target architectures: 7.0;7.5;8.0;8.6;8.9;9.0;10.0;10.1;12.0 +sage_attention-torch-ext> -- CUDA supported target architectures: 7.0;7.5;8.0;8.6;8.9;9.0;10.0;10.1;12.0 +sage_attention-torch-ext> -- PyTorch: Header version is: 12.8 +sage_attention-torch-ext> -- Found Python: /nix/store/wirj6dihrpcch7idfd7jy4l0hqfsgkk1-python3-3.13.6-env/bin/python (found version "3.13.6") found components: Interpreter +sage_attention-torch-ext> CMake Warning at /nix/store/4ww34a0xcdm3baaz7y2dnrr38r2yjwwx-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Caffe2/public/cuda.cmake:146 (message): +sage_attention-torch-ext> Failed to compute shorthash for libnvrtc.so +sage_attention-torch-ext> Call Stack (most recent call first): +sage_attention-torch-ext> /nix/store/4ww34a0xcdm3baaz7y2dnrr38r2yjwwx-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Caffe2/Caffe2Config.cmake:86 (include) +sage_attention-torch-ext> /nix/store/4ww34a0xcdm3baaz7y2dnrr38r2yjwwx-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:68 (find_package) +sage_attention-torch-ext> CMakeLists.txt:30 (find_package) +sage_attention-torch-ext> +sage_attention-torch-ext> +sage_attention-torch-ext> CMake Warning (dev) at /nix/store/0vnarm4qjnj16dr3zj9kwq6bn79c0icn-cmake-3.31.7/share/cmake-3.31/Modules/FindPackageHandleStandardArgs.cmake:441 (message): +sage_attention-torch-ext> The package name passed to `find_package_handle_standard_args` (nvtx3) does +sage_attention-torch-ext> not match the name of the calling package (Caffe2). This can lead to +sage_attention-torch-ext> problems in calling code that expects `find_package` result variables +sage_attention-torch-ext> (e.g., `_FOUND`) to follow a certain pattern. +sage_attention-torch-ext> Call Stack (most recent call first): +sage_attention-torch-ext> /nix/store/4ww34a0xcdm3baaz7y2dnrr38r2yjwwx-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Caffe2/public/cuda.cmake:184 (find_package_handle_standard_args) +sage_attention-torch-ext> /nix/store/4ww34a0xcdm3baaz7y2dnrr38r2yjwwx-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Caffe2/Caffe2Config.cmake:86 (include) +sage_attention-torch-ext> /nix/store/4ww34a0xcdm3baaz7y2dnrr38r2yjwwx-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:68 (find_package) +sage_attention-torch-ext> CMakeLists.txt:30 (find_package) +sage_attention-torch-ext> This warning is for project developers. Use -Wno-dev to suppress it. 
+sage_attention-torch-ext> +sage_attention-torch-ext> -- Could NOT find nvtx3 (missing: nvtx3_dir) +sage_attention-torch-ext> CMake Warning at /nix/store/4ww34a0xcdm3baaz7y2dnrr38r2yjwwx-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Caffe2/public/cuda.cmake:190 (message): +sage_attention-torch-ext> Cannot find NVTX3, find old NVTX instead +sage_attention-torch-ext> Call Stack (most recent call first): +sage_attention-torch-ext> /nix/store/4ww34a0xcdm3baaz7y2dnrr38r2yjwwx-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Caffe2/Caffe2Config.cmake:86 (include) +sage_attention-torch-ext> /nix/store/4ww34a0xcdm3baaz7y2dnrr38r2yjwwx-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:68 (find_package) +sage_attention-torch-ext> CMakeLists.txt:30 (find_package) +sage_attention-torch-ext> +sage_attention-torch-ext> +sage_attention-torch-ext> -- USE_CUDNN is set to 0. Compiling without cuDNN support +sage_attention-torch-ext> -- USE_CUSPARSELT is set to 0. Compiling without cuSPARSELt support +sage_attention-torch-ext> -- USE_CUDSS is set to 0. Compiling without cuDSS support +sage_attention-torch-ext> -- USE_CUFILE is set to 0. Compiling without cuFile support +sage_attention-torch-ext> CMake Warning at /nix/store/4ww34a0xcdm3baaz7y2dnrr38r2yjwwx-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Caffe2/public/utils.cmake:328 (message): +sage_attention-torch-ext> In the future we will require one to explicitly pass TORCH_CUDA_ARCH_LIST +sage_attention-torch-ext> to cmake instead of implicitly setting it as an env variable. This will +sage_attention-torch-ext> become a FATAL_ERROR in future version of pytorch. +sage_attention-torch-ext> Call Stack (most recent call first): +sage_attention-torch-ext> /nix/store/4ww34a0xcdm3baaz7y2dnrr38r2yjwwx-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Caffe2/public/cuda.cmake:337 (torch_cuda_get_nvcc_gencode_flag) +sage_attention-torch-ext> /nix/store/4ww34a0xcdm3baaz7y2dnrr38r2yjwwx-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Caffe2/Caffe2Config.cmake:86 (include) +sage_attention-torch-ext> /nix/store/4ww34a0xcdm3baaz7y2dnrr38r2yjwwx-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:68 (find_package) +sage_attention-torch-ext> CMakeLists.txt:30 (find_package) +sage_attention-torch-ext> +sage_attention-torch-ext> +sage_attention-torch-ext> -- Added CUDA NVCC flags for: -gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_89,code=sm_89;-gencode;arch=compute_90,code=sm_90;-gencode;arch=compute_100,code=sm_100;-gencode;arch=compute_101,code=sm_101;-gencode;arch=compute_120,code=sm_120 +sage_attention-torch-ext> CMake Warning at /nix/store/4ww34a0xcdm3baaz7y2dnrr38r2yjwwx-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:22 (message): +sage_attention-torch-ext> static library kineto_LIBRARY-NOTFOUND not found. 
+sage_attention-torch-ext> Call Stack (most recent call first): +sage_attention-torch-ext> /nix/store/4ww34a0xcdm3baaz7y2dnrr38r2yjwwx-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:125 (append_torchlib_if_found) +sage_attention-torch-ext> CMakeLists.txt:30 (find_package) +sage_attention-torch-ext> +sage_attention-torch-ext> +sage_attention-torch-ext> -- Found Torch: /nix/store/35sj4in2ddx47klyg96qmkpd4vh8py94-python3.13-torch-2.7.1-lib/lib/libtorch.so +sage_attention-torch-ext> -- CUDA target architectures: 7.0;7.5;8.0;8.6;8.9;9.0;10.0;10.1;12.0 +sage_attention-torch-ext> -- CUDA supported target architectures: 7.0;7.5;8.0;8.6;8.9;9.0;10.0;10.1;12.0 +sage_attention-torch-ext> -- Capabilities for kernel _fused: 9.0a;8.0;8.9 +sage_attention-torch-ext> -- Capabilities for kernel _qattn: 9.0a;8.0;8.9 +sage_attention-torch-ext> -- Capabilities for kernel _qattn_sm80: 8.0 +sage_attention-torch-ext> -- Capabilities for kernel _qattn_sm89: 8.9 +sage_attention-torch-ext> -- Capabilities for kernel _qattn_sm90: 9.0a +sage_attention-torch-ext> -- Configuring done (9.3s) +sage_attention-torch-ext> -- Generating done (0.0s) +sage_attention-torch-ext> CMake Warning: +sage_attention-torch-ext> Manually-specified variables were not used by the project: +sage_attention-torch-ext> +sage_attention-torch-ext> BUILD_TESTING +sage_attention-torch-ext> CMAKE_EXPORT_NO_PACKAGE_REGISTRY +sage_attention-torch-ext> CMAKE_INSTALL_BINDIR +sage_attention-torch-ext> CMAKE_INSTALL_DOCDIR +sage_attention-torch-ext> CMAKE_INSTALL_INCLUDEDIR +sage_attention-torch-ext> CMAKE_INSTALL_INFODIR +sage_attention-torch-ext> CMAKE_INSTALL_LIBDIR +sage_attention-torch-ext> CMAKE_INSTALL_LIBEXECDIR +sage_attention-torch-ext> CMAKE_INSTALL_LOCALEDIR +sage_attention-torch-ext> CMAKE_INSTALL_MANDIR +sage_attention-torch-ext> CMAKE_INSTALL_SBINDIR +sage_attention-torch-ext> CMAKE_POLICY_DEFAULT_CMP0025 +sage_attention-torch-ext> CUDAToolkit_INCLUDE_DIR +sage_attention-torch-ext> PROTOC_EXE +sage_attention-torch-ext> PYBIND11_PYTHONLIBS_OVERWRITE +sage_attention-torch-ext> PYTHON_EXECUTABLE +sage_attention-torch-ext> PYTHON_INCLUDE_DIR +sage_attention-torch-ext> PYTHON_SITE_PACKAGES +sage_attention-torch-ext> Protobuf_PROTOC_EXE +sage_attention-torch-ext> Protobuf_PROTOC_EXECUTABLE +sage_attention-torch-ext> +sage_attention-torch-ext> +sage_attention-torch-ext> -- Build files have been written to: /build/source/build +sage_attention-torch-ext> cmake: enabled parallel building +sage_attention-torch-ext> cmake: enabled parallel installing +sage_attention-torch-ext> Running phase: buildPhase +sage_attention-torch-ext> build flags: -j21 +sage_attention-torch-ext> -- Capabilities for kernel _qattn_sm89: 8.9 +sage_attention-torch-ext> -- Capabilities for kernel _qattn_sm90: 9.0a +sage_attention-torch-ext> -- Capabilities for kernel _qattn_sm80: 8.0 +sage_attention-torch-ext> -- Capabilities for kernel _fused: 9.0a;8.0;8.9 +sage_attention-torch-ext> -- Capabilities for kernel _qattn: 9.0a;8.0;8.9 +sage_attention-torch-ext> -- Configuring done (9.5s) +sage_attention-torch-ext> -- Generating done (0.0s) +sage_attention-torch-ext> CMake Warning: +sage_attention-torch-ext> Manually-specified variables were not used by the project: +sage_attention-torch-ext> +sage_attention-torch-ext> BUILD_TESTING +sage_attention-torch-ext> CMAKE_EXPORT_NO_PACKAGE_REGISTRY +sage_attention-torch-ext> CMAKE_INSTALL_BINDIR +sage_attention-torch-ext> CMAKE_INSTALL_DOCDIR +sage_attention-torch-ext> CMAKE_INSTALL_INCLUDEDIR 
+sage_attention-torch-ext> CMAKE_INSTALL_INFODIR +sage_attention-torch-ext> CMAKE_INSTALL_LIBDIR +sage_attention-torch-ext> CMAKE_INSTALL_LIBEXECDIR +sage_attention-torch-ext> CMAKE_INSTALL_LOCALEDIR +sage_attention-torch-ext> CMAKE_INSTALL_MANDIR +sage_attention-torch-ext> CMAKE_INSTALL_SBINDIR +sage_attention-torch-ext> CMAKE_POLICY_DEFAULT_CMP0025 +sage_attention-torch-ext> CUDAToolkit_INCLUDE_DIR +sage_attention-torch-ext> PROTOC_EXE +sage_attention-torch-ext> PYBIND11_PYTHONLIBS_OVERWRITE +sage_attention-torch-ext> PYTHON_EXECUTABLE +sage_attention-torch-ext> PYTHON_INCLUDE_DIR +sage_attention-torch-ext> PYTHON_SITE_PACKAGES +sage_attention-torch-ext> Protobuf_PROTOC_EXE +sage_attention-torch-ext> Protobuf_PROTOC_EXECUTABLE +sage_attention-torch-ext> +sage_attention-torch-ext> +sage_attention-torch-ext> -- Build files have been written to: /build/source/build +sage_attention-torch-ext> cmake: enabled parallel building +sage_attention-torch-ext> cmake: enabled parallel installing +sage_attention-torch-ext> Running phase: buildPhase +sage_attention-torch-ext> build flags: -j21 +sage_attention-torch-ext> -- Capabilities for kernel _fused: 9.0a;8.0;8.9 +sage_attention-torch-ext> -- Capabilities for kernel _qattn_sm89: 8.9 +sage_attention-torch-ext> -- Capabilities for kernel _qattn: 9.0a;8.0;8.9 +sage_attention-torch-ext> -- Capabilities for kernel _qattn_sm80: 8.0 +sage_attention-torch-ext> -- Capabilities for kernel _qattn_sm90: 9.0a +sage_attention-torch-ext> -- Configuring done (9.6s) +sage_attention-torch-ext> -- Generating done (0.0s) +sage_attention-torch-ext> CMake Warning: +sage_attention-torch-ext> Manually-specified variables were not used by the project: +sage_attention-torch-ext> +sage_attention-torch-ext> BUILD_TESTING +sage_attention-torch-ext> CMAKE_EXPORT_NO_PACKAGE_REGISTRY +sage_attention-torch-ext> CMAKE_INSTALL_BINDIR +sage_attention-torch-ext> CMAKE_INSTALL_DOCDIR +sage_attention-torch-ext> CMAKE_INSTALL_INCLUDEDIR +sage_attention-torch-ext> CMAKE_INSTALL_INFODIR +sage_attention-torch-ext> CMAKE_INSTALL_LIBDIR +sage_attention-torch-ext> CMAKE_INSTALL_LIBEXECDIR +sage_attention-torch-ext> CMAKE_INSTALL_LOCALEDIR +sage_attention-torch-ext> CMAKE_INSTALL_MANDIR +sage_attention-torch-ext> CMAKE_INSTALL_SBINDIR +sage_attention-torch-ext> CMAKE_POLICY_DEFAULT_CMP0025 +sage_attention-torch-ext> CUDAToolkit_INCLUDE_DIR +sage_attention-torch-ext> PROTOC_EXE +sage_attention-torch-ext> PYBIND11_PYTHONLIBS_OVERWRITE +sage_attention-torch-ext> PYTHON_EXECUTABLE +sage_attention-torch-ext> PYTHON_INCLUDE_DIR +sage_attention-torch-ext> PYTHON_SITE_PACKAGES +sage_attention-torch-ext> Protobuf_PROTOC_EXE +sage_attention-torch-ext> Protobuf_PROTOC_EXECUTABLE +sage_attention-torch-ext> +sage_attention-torch-ext> +sage_attention-torch-ext> -- Build files have been written to: /build/source/build +sage_attention-torch-ext> cmake: enabled parallel building +sage_attention-torch-ext> cmake: enabled parallel installing +sage_attention-torch-ext> Running phase: buildPhase +sage_attention-torch-ext> build flags: -j21 +sage_attention-torch-ext> -- Capabilities for kernel _qattn_sm90: 9.0a +sage_attention-torch-ext> -- Capabilities for kernel _qattn: 9.0a;8.0;8.9 +sage_attention-torch-ext> -- Capabilities for kernel _fused: 9.0a;8.0;8.9 +sage_attention-torch-ext> -- Capabilities for kernel _qattn_sm89: 8.9 +sage_attention-torch-ext> -- Capabilities for kernel _qattn_sm80: 8.0 +sage_attention-torch-ext> -- Configuring done (9.7s) +sage_attention-torch-ext> -- Generating done (0.0s) 
+sage_attention-torch-ext> CMake Warning: +sage_attention-torch-ext> Manually-specified variables were not used by the project: +sage_attention-torch-ext> +sage_attention-torch-ext> BUILD_TESTING +sage_attention-torch-ext> CMAKE_EXPORT_NO_PACKAGE_REGISTRY +sage_attention-torch-ext> CMAKE_INSTALL_BINDIR +sage_attention-torch-ext> CMAKE_INSTALL_DOCDIR +sage_attention-torch-ext> CMAKE_INSTALL_INCLUDEDIR +sage_attention-torch-ext> CMAKE_INSTALL_INFODIR +sage_attention-torch-ext> CMAKE_INSTALL_LIBDIR +sage_attention-torch-ext> CMAKE_INSTALL_LIBEXECDIR +sage_attention-torch-ext> CMAKE_INSTALL_LOCALEDIR +sage_attention-torch-ext> CMAKE_INSTALL_MANDIR +sage_attention-torch-ext> CMAKE_INSTALL_SBINDIR +sage_attention-torch-ext> CMAKE_POLICY_DEFAULT_CMP0025 +sage_attention-torch-ext> CUDAToolkit_INCLUDE_DIR +sage_attention-torch-ext> PROTOC_EXE +sage_attention-torch-ext> PYBIND11_PYTHONLIBS_OVERWRITE +sage_attention-torch-ext> PYTHON_EXECUTABLE +sage_attention-torch-ext> PYTHON_INCLUDE_DIR +sage_attention-torch-ext> PYTHON_SITE_PACKAGES +sage_attention-torch-ext> Protobuf_PROTOC_EXE +sage_attention-torch-ext> Protobuf_PROTOC_EXECUTABLE +sage_attention-torch-ext> +sage_attention-torch-ext> +sage_attention-torch-ext> -- Build files have been written to: /build/source/build +sage_attention-torch-ext> cmake: enabled parallel building +sage_attention-torch-ext> cmake: enabled parallel installing +sage_attention-torch-ext> Running phase: buildPhase +sage_attention-torch-ext> build flags: -j21 +sage_attention-torch-ext> -- Capabilities for kernel _qattn_sm89: 8.9 +sage_attention-torch-ext> -- Capabilities for kernel _fused: 9.0a;8.0;8.9 +sage_attention-torch-ext> -- Capabilities for kernel _qattn: 9.0a;8.0;8.9 +sage_attention-torch-ext> -- Capabilities for kernel _qattn_sm80: 8.0 +sage_attention-torch-ext> -- Capabilities for kernel _qattn_sm90: 9.0a +sage_attention-torch-ext> -- Configuring done (9.8s) +sage_attention-torch-ext> -- Generating done (0.0s) +sage_attention-torch-ext> CMake Warning: +sage_attention-torch-ext> Manually-specified variables were not used by the project: +sage_attention-torch-ext> +sage_attention-torch-ext> BUILD_TESTING +sage_attention-torch-ext> CMAKE_EXPORT_NO_PACKAGE_REGISTRY +sage_attention-torch-ext> CMAKE_INSTALL_BINDIR +sage_attention-torch-ext> CMAKE_INSTALL_DOCDIR +sage_attention-torch-ext> CMAKE_INSTALL_INCLUDEDIR +sage_attention-torch-ext> CMAKE_INSTALL_INFODIR +sage_attention-torch-ext> CMAKE_INSTALL_LIBDIR +sage_attention-torch-ext> CMAKE_INSTALL_LIBEXECDIR +sage_attention-torch-ext> CMAKE_INSTALL_LOCALEDIR +sage_attention-torch-ext> CMAKE_INSTALL_MANDIR +sage_attention-torch-ext> CMAKE_INSTALL_SBINDIR +sage_attention-torch-ext> CMAKE_POLICY_DEFAULT_CMP0025 +sage_attention-torch-ext> CUDAToolkit_INCLUDE_DIR +sage_attention-torch-ext> PROTOC_EXE +sage_attention-torch-ext> PYBIND11_PYTHONLIBS_OVERWRITE +sage_attention-torch-ext> PYTHON_EXECUTABLE +sage_attention-torch-ext> PYTHON_INCLUDE_DIR +sage_attention-torch-ext> PYTHON_SITE_PACKAGES +sage_attention-torch-ext> Protobuf_PROTOC_EXE +sage_attention-torch-ext> Protobuf_PROTOC_EXECUTABLE +sage_attention-torch-ext> +sage_attention-torch-ext> +sage_attention-torch-ext> -- Build files have been written to: /build/source/build +sage_attention-torch-ext> cmake: enabled parallel building +sage_attention-torch-ext> cmake: enabled parallel installing +sage_attention-torch-ext> Running phase: buildPhase +sage_attention-torch-ext> build flags: -j21 +sage_attention-torch-ext> [1/12] Building CXX object 
CMakeFiles/_sage_attention_44b112f_dirty.dir/torch-ext/torch_binding.cpp.o +sage_attention-torch-ext> [1/12] Building CXX object CMakeFiles/_sage_attention_44b112f_dirty.dir/torch-ext/torch_binding.cpp.o +sage_attention-torch-ext> [1/12] Building CXX object CMakeFiles/_sage_attention_44b112f_dirty.dir/torch-ext/torch_binding.cpp.o +sage_attention-torch-ext> [1/12] Building CXX object CMakeFiles/_sage_attention_44b112f_dirty.dir/torch-ext/torch_binding.cpp.o +sage_attention-torch-ext> [1/12] Building CXX object CMakeFiles/_sage_attention_44b112f_dirty.dir/torch-ext/torch_binding.cpp.o +sage_attention-torch-ext> [2/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_attn.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 441.985 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 435.474 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 542.186 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 537.633 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 556.850 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 558.532 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 525.842 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 527.640 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 44 bytes spill stores, 44 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 453.136 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 44 bytes spill stores, 44 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 455.551 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 44 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 560.086 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 44 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 566.771 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 451.886 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 452.820 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 563.157 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 552.506 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 223.644 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 223.313 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 228.182 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : 
Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 228.007 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 235 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 221.210 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 235 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 221.780 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 228.300 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 
bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 227.566 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 241 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 245.498 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 241 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 244.960 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 232 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 249.012 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 232 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile 
time = 249.548 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 247 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 244.845 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 247 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 244.634 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 240 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 251.408 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 240 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 250.002 ms +sage_attention-torch-ext> [2/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 
'threads', the last value of this option was used +sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(627): warning #177-D: variable "padded_kv_len" was declared but never referenced +sage_attention-torch-ext> int qo_len, kv_len, padded_kv_len, num_qo_heads, num_kv_heads; +sage_attention-torch-ext> ^ +sage_attention-torch-ext> +sage_attention-torch-ext> Remark: The warnings can be suppressed with "-diag-suppress <error-number>" +sage_attention-torch-ext> +sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(803): warning #177-D: variable "padded_kv_len" was declared but never referenced +sage_attention-torch-ext> int qo_len, kv_len, padded_kv_len, num_qo_heads, num_kv_heads; +sage_attention-torch-ext> ^ +sage_attention-torch-ext> +sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(170): warning #177-D: variable "sO" was declared but never referenced +sage_attention-torch-ext> half *sO = (half*)smem_; +sage_attention-torch-ext> ^ +sage_attention-torch-ext> +sage_attention-torch-ext> ptxas info : 10 bytes gmem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry 
function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads 
+sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads 
+sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads 
+sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> [3/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_attn.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 8 bytes stack frame, 8 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 8 bytes stack frame, 8 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 
bytes stack frame, 8 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 8 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 60 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 60 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] 
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 239 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 239 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 239 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 239 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 234 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 234 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 
0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 232 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 232 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> [3/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 56 bytes stack frame, 72 bytes spill stores, 56 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 56 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 586.615 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 56 bytes stack frame, 72 bytes spill stores, 56 
bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 56 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 588.685 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 80 bytes spill stores, 68 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 592.250 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 80 bytes spill stores, 68 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 593.767 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 574.520 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] 
+sage_attention-torch-ext> ptxas info : Compile time = 573.749 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 32 bytes spill stores, 28 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 586.361 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 32 bytes spill stores, 28 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 582.063 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 96 bytes stack frame, 176 bytes spill stores, 168 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 522.972 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 96 bytes stack frame, 176 bytes spill stores, 168 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 516.314 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 96 bytes stack frame, 92 bytes spill stores, 88 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 629.345 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 96 bytes stack frame, 92 bytes spill stores, 88 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 626.246 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 80 bytes spill stores, 68 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 502.294 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 80 bytes spill stores, 68 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 503.470 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 80 bytes stack frame, 60 bytes spill stores, 48 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 619.844 ms
[... per-kernel ptxas statistics and compile times for the remaining sm_89 qk_int_sv_f8_attn_kernel instantiations in this object elided ...]
+sage_attention-torch-ext> [4/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_attn.cu.o
+sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
+sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4]
[... per-kernel ptxas statistics for sm89_qk_int8_sv_f8_accum_f32_attn.cu elided ...]
+sage_attention-torch-ext> [5/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f16_attn_inst_buf.cu.o
+sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
+sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4]
[... per-kernel ptxas statistics for sm89_qk_int8_sv_f8_accum_f16_attn_inst_buf.cu elided ...]
+sage_attention-torch-ext> [2/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu.o
+sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
+sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(627): warning #177-D: variable "padded_kv_len" was declared but never referenced
+sage_attention-torch-ext> int qo_len, kv_len, padded_kv_len, num_qo_heads, num_kv_heads;
+sage_attention-torch-ext> ^
+sage_attention-torch-ext>
+sage_attention-torch-ext> Remark: The warnings can be suppressed with "-diag-suppress <error-number>"
+sage_attention-torch-ext>
+sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(803): warning #177-D: variable "padded_kv_len" was declared but never referenced
+sage_attention-torch-ext> int qo_len, kv_len, padded_kv_len, num_qo_heads, num_kv_heads;
+sage_attention-torch-ext> ^
+sage_attention-torch-ext>
+sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(170): warning #177-D: variable "sO" was declared but never referenced
+sage_attention-torch-ext> half *sO = (half*)smem_;
+sage_attention-torch-ext> ^
+sage_attention-torch-ext>
+sage_attention-torch-ext> ptxas info : 10 bytes gmem
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem
[... additional sm_90a qk_int8_sv_f8_attn_kernel statistics elided ...]
+sage_attention-torch-ext> ptxas info : Compiling entry function
'_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill 
loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads 
+sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill 
loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> [3/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 24 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 24 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 88 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 88 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 24 bytes stack frame, 36 bytes spill stores, 20 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 24 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 24 bytes stack frame, 36 bytes spill stores, 20 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 24 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 16 bytes spill stores, 12 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 16 bytes spill stores, 12 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes 
cmem[2]

[ptxas resource summary for the remaining __half/__nv_bfloat16 instantiations of qk_int_sv_f8_attn_kernel on 'sm_89': every variant reports 255 registers, 1 barrier, 488 bytes cmem[0] and 8 bytes cmem[2]; stack frames range from 0 to 80 bytes, with at most 52 bytes of spill stores and 40 bytes of spill loads.]

+sage_attention-torch-ext> [4/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/fused/fused.cu.o
+sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used

[ptxas resource summary for the fused kernels (MeanScaleKernel, TransposePadPermuteKernel, SubMeanKernel, QuantInt8Kernel), compiled for 'sm_90a', 'sm_80' and 'sm_89': every instantiation reports a 0-byte stack frame with no spills; register usage stays between 16 and 40 (MeanScaleKernel 32-40, TransposePadPermuteKernel 19-25, SubMeanKernel 16-20, QuantInt8Kernel 25-38), with 132-392 bytes of static smem for the quantization and mean/scale kernels and 16-32 KiB of smem for TransposePadPermuteKernel. The 'sm_89' QuantInt8Kernel entries continue below.]

+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info :
Function properties for _Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.230 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.119 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.241 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.773 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.295 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.169 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill 
stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 38 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 12.442 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 8.177 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 8.164 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 8.109 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 40 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 10.826 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 8.233 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] 
+sage_attention-torch-ext> ptxas info : Compile time = 8.210 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 8.194 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 33 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 11.782 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 8.427 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.885 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 8.938 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 33 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 11.257 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 8.465 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 8.747 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.217 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 33 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 10.086 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.344 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.385 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function 
properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.296 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 33 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 10.133 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.429 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.385 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.303 ms +sage_attention-torch-ext> [2/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_attn.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 8 bytes stack 
frame, 8 bytes spill stores, 8 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 631.269 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 678.510 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 889.763 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 842.414 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 
bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 848.507 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 860.566 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 852.066 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 857.813 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 44 bytes spill stores, 44 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 798.712 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 44 bytes spill stores, 44 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 814.182 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 44 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 766.806 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 44 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 563.761 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 460.277 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 462.993 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 565.805 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 557.043 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 226.177 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 
'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 223.149 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 228.610 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 229.459 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 235 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 224.016 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 235 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 223.754 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 231.030 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 231.032 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 241 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 248.149 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes 
spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 241 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 248.080 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 232 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 253.757 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 232 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 254.788 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 247 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 257.808 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 247 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 248.484 ms +sage_attention-torch-ext> ptxas info : 
Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 240 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 256.288 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 240 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 255.649 ms +sage_attention-torch-ext> [4/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_attn.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 8 bytes stack frame, 8 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 8 bytes stack frame, 8 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes 
cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> [ptxas output condensed: the remaining qk_int_sv_f8_attn_kernel template instantiations for this object (__half and __nv_bfloat16 variants across the QuantGranularity and MaskMode combinations) all compile for 'sm_89' with 230-255 registers and 1 barrier, 0-80 bytes stack frame, at most 60 bytes of spill stores, 488 bytes cmem[0] and 8 bytes cmem[2].]
+sage_attention-torch-ext> [3/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_attn.cu.o
+sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
+sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4]
+sage_attention-torch-ext> [ptxas output condensed: every qk_int_sv_f8_attn_kernel instantiation in this translation unit compiles for 'sm_89' with 230-255 registers and 1 barrier, up to 80 bytes stack frame and 56 bytes of spill stores, 488 bytes cmem[0] and 8 bytes cmem[2]; per-kernel ptxas compile times range from roughly 240 ms to 800 ms.]
+sage_attention-torch-ext> [4/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_fuse_v_mean_attn.cu.o
+sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
+sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4]
+sage_attention-torch-ext> [ptxas output condensed: the instantiations reported so far for this translation unit compile for 'sm_89' with 230-255 registers and 1 barrier, up to 80 bytes stack frame and 52 bytes of register spills, 488 bytes cmem[0] and 8 bytes cmem[2]; per-kernel ptxas compile times range from roughly 240 ms to 990 ms.]
+sage_attention-torch-ext> ptxas info : Compile time = 276.028 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 278.538 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 272.368 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 270.965 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 232 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 278.318 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 232 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 276.783 ms +sage_attention-torch-ext> [5/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_attn.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes 
stack frame, 16 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 235 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 235 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 241 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 
0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 241 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 232 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 232 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 247 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 247 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 240 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 240 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> [5/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_attn_inst_buf.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 64 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 950.424 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 64 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 938.339 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' 
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 8 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 942.385 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 8 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 951.773 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 84 bytes spill stores, 56 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 941.110 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 84 bytes spill stores, 56 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 948.921 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 16 bytes spill stores, 12 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 953.087 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 16 bytes spill stores, 12 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 937.847 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 64 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 796.084 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 64 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 790.276 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 1005.200 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 1014.114 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 36 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 792.773 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 36 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 810.731 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 1016.541 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 1015.399 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 512.495 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 519.312 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes 
stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 459.927 ms
[ptxas per-instantiation statistics for the remaining qk_int_sv_f8_attn_kernel variants compiled for 'sm_89'; the "[5/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/fused/fused.cu.o" step, which emits an nvcc warning about an incompatible redefinition of the 'threads' option; and the fused MeanScaleKernel, TransposePadPermuteKernel, SubMeanKernel, and QuantInt8Kernel instantiations compiled for 'sm_90a', 'sm_80', and 'sm_89'. Across these kernels ptxas reports at most 255 registers, at most 16 bytes of spill stores/loads, up to 32768 bytes of shared memory, and per-kernel compile times between roughly 3 ms and 560 ms.]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem,
432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 8.424 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 8.406 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 8.362 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 33 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 9.822 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.583 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.380 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.357 ms +sage_attention-torch-ext> ptxas info : Compiling entry 
function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 33 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 9.895 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.353 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.400 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.508 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 33 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 10.780 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.433 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function 
properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.559 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.439 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 33 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 10.336 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.564 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.563 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.546 ms +sage_attention-torch-ext> [6/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(627): warning 
#177-D: variable "padded_kv_len" was declared but never referenced +sage_attention-torch-ext> int qo_len, kv_len, padded_kv_len, num_qo_heads, num_kv_heads; +sage_attention-torch-ext> ^ +sage_attention-torch-ext> +sage_attention-torch-ext> Remark: The warnings can be suppressed with "-diag-suppress <warning-number>" +sage_attention-torch-ext> +sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(803): warning #177-D: variable "padded_kv_len" was declared but never referenced +sage_attention-torch-ext> int qo_len, kv_len, padded_kv_len, num_qo_heads, num_kv_heads; +sage_attention-torch-ext> ^ +sage_attention-torch-ext> +sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(170): warning #177-D: variable "sO" was declared but never referenced +sage_attention-torch-ext> half *sO = (half*)smem_; +sage_attention-torch-ext> ^ +sage_attention-torch-ext> +sage_attention-torch-ext> ptxas info : 11 bytes gmem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 254.401 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 258.197 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 292.728 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 288.496 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 259.609 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 257.610 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 310.313 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 261.558 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 305.399 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 290.890 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 321.909 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 268.549 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 305.193 ms 
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 323.257 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 306.249 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 328.274 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 198.513 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> 
ptxas info : Compile time = 195.701 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 203.394 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 206.831 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 188.040 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 177.604 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 
barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 134.328 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 110.982 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 112.759 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 112.951 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 117.419 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads 
+sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 119.115 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 113.767 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 113.382 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 117.939 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 118.428 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf 
+sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 161.385 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 162.059 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 163.835 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 165.521 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 159.573 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 159.492 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 163.714 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 164.815 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 171.328 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 166.435 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' 
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 172.348 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 172.724 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 168.493 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 166.818 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 173.286 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 173.009 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 105.165 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 107.053 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 112.071 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 110.342 
ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 106.804 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 106.230 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 111.573 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 110.108 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem 
+sage_attention-torch-ext> ptxas info : Compile time = 113.093 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 113.517 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 117.870 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 118.980 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 114.139 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 
registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 113.256 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 116.641 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 115.081 ms +sage_attention-torch-ext> [6/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_fuse_v_mean_attn.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] 
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 24 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 24 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 20 bytes spill stores, 20 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 20 bytes spill stores, 20 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes 
cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 60 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 60 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 245 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 245 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 233 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 233 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 
bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 234 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 234 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 232 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 232 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> [7/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 28 bytes spill stores, 12 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 964.900 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 28 bytes spill stores, 12 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 958.181 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 76 bytes spill stores, 44 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 958.371 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 76 bytes spill stores, 44 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 958.394 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 28 bytes spill stores, 12 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 939.503 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 28 bytes spill stores, 12 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 701.023 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 16 bytes spill stores, 12 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 573.680 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 16 bytes spill stores, 12 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 801.568 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 813.002 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 808.011 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 1016.111 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 977.034 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 754.749 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 731.276 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 923.620 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 940.844 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 459.245 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf 
+sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 457.206 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 451.179 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 450.787 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 447.327 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile 
time = 431.471 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 506.880 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 527.163 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 553.846 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 551.678 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 568.979 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 495.100 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 483.901 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 477.401 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas 
info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 496.705 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 515.510 ms +sage_attention-torch-ext> [6/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 28 bytes spill stores, 12 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 966.091 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 28 bytes spill stores, 12 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 961.337 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 76 bytes spill stores, 44 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 972.006 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 76 bytes spill stores, 44 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 974.650 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 28 bytes spill stores, 12 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 957.506 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 28 bytes spill stores, 12 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 960.363 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 16 bytes spill stores, 12 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 971.507 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 16 bytes spill stores, 12 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 970.596 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 807.611 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 802.233 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 1033.905 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 1024.056 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 810.552 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 796.703 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 1004.351 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 1012.638 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 510.266 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 512.618 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' 
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 521.439 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 524.050 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 512.244 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 508.562 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 529.261 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 519.651 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 547.796 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 543.208 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 
0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 560.784 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 556.368 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 546.293 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 550.874 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 558.939 ms 
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 549.771 ms +sage_attention-torch-ext> [7/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 264 bytes stack frame, 276 bytes spill stores, 280 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 264 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 264 bytes stack frame, 276 bytes spill stores, 280 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 264 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 288 bytes stack frame, 268 bytes spill stores, 268 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 288 bytes cumulative stack size, 488 bytes 
cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 288 bytes stack frame, 268 bytes spill stores, 268 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 288 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 256 bytes stack frame, 272 bytes spill stores, 276 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 256 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 256 bytes stack frame, 272 bytes spill stores, 276 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 256 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 272 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 272 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 272 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 272 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 224 bytes stack frame, 252 bytes spill stores, 248 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 224 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 224 bytes stack frame, 252 bytes spill stores, 248 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 224 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 256 bytes stack frame, 216 bytes spill stores, 232 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 256 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 256 bytes stack frame, 216 bytes spill stores, 232 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 256 bytes cumulative stack size, 488 bytes 
cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 216 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 216 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 216 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 216 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 256 bytes stack frame, 212 bytes spill stores, 224 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 256 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 256 bytes stack frame, 212 bytes spill stores, 224 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 256 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 231 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 231 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 238 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 238 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 235 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 
0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 235 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 242 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 242 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 233 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 233 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 242 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 242 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> [8/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_attn_inst_buf.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 64 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 551.911 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 64 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 541.946 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' 
+sage_attention-torch-ext> [ptxas -v resource reports elided: per-instantiation statistics for the sm_89 qk_int_sv_f8_attn_kernel template variants built in steps [6/12] (sm89_qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf.cu) and [8/12] (sm89_qk_int8_sv_f8_accum_f32_attn_inst_buf.cu), plus the preceding object. Every variant compiles for sm_89 with 231-255 registers, 0-288 bytes of stack frame, and at most ~280 bytes of spill stores/loads; reported per-kernel compile times range from roughly 0.26 s to 0.59 s. nvcc also emits "incompatible redefinition for option 'threads', the last value of this option was used" for each translation unit.]
info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> [7/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(627): warning #177-D: variable "padded_kv_len" was declared but never referenced +sage_attention-torch-ext> int qo_len, kv_len, padded_kv_len, num_qo_heads, num_kv_heads; +sage_attention-torch-ext> ^ +sage_attention-torch-ext> +sage_attention-torch-ext> Remark: The warnings can be suppressed with "-diag-suppress " +sage_attention-torch-ext> +sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(803): warning #177-D: variable "padded_kv_len" was declared but never referenced +sage_attention-torch-ext> int qo_len, kv_len, padded_kv_len, num_qo_heads, num_kv_heads; +sage_attention-torch-ext> ^ +sage_attention-torch-ext> +sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(170): warning #177-D: variable "sO" was declared but never referenced +sage_attention-torch-ext> half *sO = (half*)smem_; +sage_attention-torch-ext> ^ +sage_attention-torch-ext> +sage_attention-torch-ext> ptxas info : 11 bytes gmem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 250.137 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' 
+sage_attention-torch-ext> [... ptxas resource-usage reports for the remaining sm_90a 'qk_int8_sv_f8_attn_kernel' instantiations omitted: each uses 128-168 registers, 1 barrier, 128 bytes smem, a 0- or 16-byte stack frame and no spills; per-kernel compile times range from roughly 100 ms to 325 ms ...]
+sage_attention-torch-ext> [9/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f16_attn_inst_buf.cu.o
+sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
+sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4]
+sage_attention-torch-ext> [... ptxas resource-usage reports for the sm_89 'qk_int_sv_f8_attn_kernel' instantiations in this translation unit omitted: each uses 255 registers, 1 barrier, a 256-288 byte stack frame with roughly 230-300 bytes of spill stores/loads, 488 bytes cmem[0] and 8 bytes cmem[2] ...]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 272 bytes stack frame, 292 bytes spill stores, 292 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 272 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 256 bytes stack frame, 212 bytes spill stores, 224 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 256 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 256 bytes stack frame, 212 bytes spill stores, 224 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 256 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry 
function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 240 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 240 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 228 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 228 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf 
+sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 240 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 240 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 235 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 235 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 242 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' 
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 242 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 231 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 231 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 240 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 240 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] 
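Editor's note on reading the resource lines above: the per-kernel figures (registers, spill stores/loads, cumulative stack size, smem, cmem) are ptxas verbose output, presumably enabled in this build through nvcc's `--ptxas-options=-v` (a.k.a. `-Xptxas -v`) or `--resource-usage` flag. If it helps to cross-check those numbers on an actual device, the sketch below is a hypothetical standalone probe (not part of this diff; the file name `probe.cu` and the kernel `dummy_kernel` are placeholders) that queries the corresponding attributes at runtime with `cudaFuncGetAttributes`: `numRegs`, `localSizeBytes`, and `sharedSizeBytes` line up with the "Used N registers", stack-frame/spill bytes, and static "bytes smem" values reported by ptxas.

// Hypothetical probe, not part of this build. Compile for the target arch, e.g.:
//   nvcc -arch=sm_90a --ptxas-options=-v probe.cu -o probe
#include <cstdio>
#include <cuda_runtime.h>

// Stand-in kernel; in practice you would take the address of one of the
// template instantiations listed in the log above instead.
__global__ void dummy_kernel(float* out) {
    out[threadIdx.x] = static_cast<float>(threadIdx.x);
}

int main() {
    cudaFuncAttributes attr{};
    if (cudaFuncGetAttributes(&attr, dummy_kernel) != cudaSuccess) {
        std::fprintf(stderr, "cudaFuncGetAttributes failed\n");
        return 1;
    }
    // numRegs         -> "Used N registers"
    // localSizeBytes  -> per-thread local memory (stack frame / spill traffic)
    // sharedSizeBytes -> static "bytes smem"
    // constSizeBytes  -> "bytes cmem"
    std::printf("regs=%d local=%zu smem=%zu const=%zu\n",
                attr.numRegs, attr.localSizeBytes,
                attr.sharedSizeBytes, attr.constSizeBytes);
    return 0;
}

The runtime query reports the resources of the cubin actually loaded for the current GPU, so it is also a quick way to confirm that the intended sm_80 / sm_89 / sm_90a variant was selected.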
+sage_attention-torch-ext> [7/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/fused/fused.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : 10 bytes gmem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb1E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb1E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 392 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb1E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb1E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 392 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb0E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb0E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 260 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb0E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb0E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 260 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj128ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj128ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 21 registers, used 1 barriers, 32768 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj64ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj64ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 19 registers, used 1 barriers, 16384 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj128ELj64ELb1E6__halfEvPT2_S2_jjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj128ELj64ELb1E6__halfEvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 21 registers, used 1 barriers, 
32768 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 19 registers, used 1 barriers, 16384 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 20 registers, used 0 barriers +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 20 registers, used 0 barriers +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 38 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill 
loads +sage_attention-torch-ext> ptxas info : Used 29 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 29 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 38 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 29 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 29 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill 
loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb1E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb1E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 392 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb1E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb1E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 392 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb0E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb0E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 260 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb0E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb0E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 260 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj128ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj128ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 32768 bytes smem, 396 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj64ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj64ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 20 registers, used 1 barriers, 16384 bytes smem, 396 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj128ELj64ELb1E6__halfEvPT2_S2_jjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj128ELj64ELb1E6__halfEvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 32768 bytes smem, 396 bytes cmem[0] 
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 20 registers, used 1 barriers, 16384 bytes smem, 396 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 20 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 20 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> 
ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 38 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj
+sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0]
[Build log, condensed: per-kernel ptxas resource reports for the QuantInt8Kernel, MeanScaleKernel, TransposePadPermuteKernel, SubMeanKernel and qk_int_sv_f8_attn_kernel template instantiations compiled for sm_80, sm_89 and sm_90a, covering object steps [8/12] sm89_qk_int8_sv_f8_accum_f16_attn_inst_buf.cu.o and [10/12] fused/fused.cu.o. The quantization, transpose and mean-subtraction kernels use 16-40 registers, at most 32768 bytes of shared memory and no spills; the sm_89 qk_int_sv_f8 attention kernels use 249-255 registers, spill up to 176 bytes and take roughly 0.4-1.0 s of ptxas compile time per instantiation. nvcc also emits the warning "incompatible redefinition for option 'threads', the last value of this option was used" for each of these objects.]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a'
+sage_attention-torch-ext> ptxas info : Function properties
for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 29 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 
0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : 
Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb1E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb1E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 392 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb1E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb1E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 392 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb0E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb0E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 260 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb0E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb0E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 260 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj128ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj128ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 32768 bytes smem, 396 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj64ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj' for 'sm_80' 
+sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj64ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 20 registers, used 1 barriers, 16384 bytes smem, 396 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj128ELj64ELb1E6__halfEvPT2_S2_jjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj128ELj64ELb1E6__halfEvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 32768 bytes smem, 396 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 20 registers, used 1 barriers, 16384 bytes smem, 396 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 20 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 20 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes 
spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info 
: Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 38 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 38 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 25 registers, 
used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 
bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem, 432 bytes 
cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb1E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb1E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 39 registers, used 1 barriers, 392 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb1E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb1E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 39 registers, used 1 barriers, 392 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb0E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb0E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 40 registers, used 1 barriers, 260 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb0E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb0E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 40 registers, used 1 barriers, 260 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25TransposePadPermuteKernelILj128ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj128ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 21 registers, used 1 barriers, 32768 bytes smem, 396 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj64ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj64ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 19 registers, used 1 barriers, 16384 bytes smem, 396 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj128ELj64ELb1E6__halfEvPT2_S2_jjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj128ELj64ELb1E6__halfEvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 21 registers, used 1 barriers, 32768 bytes smem, 396 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 19 registers, used 1 barriers, 16384 bytes smem, 396 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 18 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 18 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 38 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 40 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 33 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj
+sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 33 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0]
[ptxas, sm_89: the remaining QuantInt8Kernel instantiations (__half / __nv_bfloat16, 64/128 block-size variants) use 26-33 registers, 1 barrier, 132 bytes smem, and 432 bytes cmem[0], with no stack frame and no spills.]
+sage_attention-torch-ext> [9/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_attn.cu.o
+sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
+sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4]
[ptxas, sm_89: this unit's qk_int_sv_f8_attn_kernel instantiations (__half / __nv_bfloat16, QuantGranularity 2/3, MaskMode 0/1) use 230-255 registers, 1 barrier, and 0-80 bytes of stack frame with at most 56/48 bytes of spill stores/loads, plus 488 bytes cmem[0] and 8 bytes cmem[2]; per-kernel compile times range from roughly 0.23 s to 0.99 s.]
+sage_attention-torch-ext> [9/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f16_attn_inst_buf.cu.o
+sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
+sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4]
[ptxas, sm_89: this unit's qk_int_sv_f8_attn_kernel instantiations use 249-255 registers, 1 barrier, and 0-112 bytes of stack frame with at most 176/164 bytes of spill stores/loads, plus 488 bytes cmem[0] and 8 bytes cmem[2]; per-kernel compile times range from roughly 0.39 s to 1.10 s.]
+sage_attention-torch-ext> [10/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf.cu.o
+sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
+sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4]
[ptxas, sm_89: the qk_int_sv_f8_attn_kernel instantiations reported so far for this unit use 255 registers, 1 barrier, and 16-96 bytes of stack frame with at most 176/168 bytes of spill stores/loads, plus 488 bytes cmem[0] and 8 bytes cmem[2]; per-kernel compile times range from roughly 0.59 s to 1.13 s.]
+sage_attention-torch-ext> ptxas info : Compiling entry function
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 60 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 1094.002 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 251 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 401.149 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 251 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 391.095 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 442.479 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info 
: Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 440.891 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 428.954 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 437.536 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 459.124 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 
bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 442.354 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 462.155 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 471.281 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 253 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 489.585 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 253 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile 
time = 469.265 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 451.902 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 457.616 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 473.635 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 469.484 ms +sage_attention-torch-ext> [8/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf.cu.o +sage_attention-torch-ext> nvcc warning : 
incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 24 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 24 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 88 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 88 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 
'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 24 bytes stack frame, 36 bytes spill stores, 20 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 24 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 24 bytes stack frame, 36 bytes spill stores, 20 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 24 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 16 bytes spill stores, 12 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 16 bytes spill stores, 12 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 
1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 
bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info 
: Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> [10/12] Building CUDA object 
CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_fuse_v_mean_attn.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 44 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 984.592 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 44 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 990.417 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 24 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 1003.978 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 24 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 
registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 991.593 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 973.002 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 963.093 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 974.741 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 970.046 ms 
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 829.686 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 778.324 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 48 bytes spill stores, 52 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 814.682 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 48 bytes spill stores, 52 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 994.113 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 826.298 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 831.656 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 996.818 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 1000.920 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 245 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 406.501 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 245 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 410.054 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 419.738 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 419.590 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas 
info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 233 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 381.081 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 233 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 371.843 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 422.365 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 424.444 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf 
+sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 234 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 460.547 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 234 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 454.437 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 464.828 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 448.633 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile 
time = 452.369 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 477.124 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 232 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 455.888 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 232 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 448.456 ms +sage_attention-torch-ext> [9/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_attn_inst_buf.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 68 bytes spill stores, 52 bytes spill loads +sage_attention-torch-ext> ptxas info 
: Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 68 bytes spill stores, 52 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 8 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 8 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 84 bytes spill stores, 56 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function 
properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 84 bytes spill stores, 56 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 20 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 20 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 64 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 64 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 
bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 36 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 36 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> [10/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_fuse_v_mean_attn.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 24 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 24 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 20 bytes spill stores, 20 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 20 bytes spill stores, 20 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes 
cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 60 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 60 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 245 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 245 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 233 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 233 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 
bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 234 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 234 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info 
: Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 232 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 232 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> [2/12] Building CUDA object 
CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(627): warning #177-D: variable "padded_kv_len" was declared but never referenced +sage_attention-torch-ext> int qo_len, kv_len, padded_kv_len, num_qo_heads, num_kv_heads; +sage_attention-torch-ext> ^ +sage_attention-torch-ext> +sage_attention-torch-ext> Remark: The warnings can be suppressed with "-diag-suppress " +sage_attention-torch-ext> +sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(803): warning #177-D: variable "padded_kv_len" was declared but never referenced +sage_attention-torch-ext> int qo_len, kv_len, padded_kv_len, num_qo_heads, num_kv_heads; +sage_attention-torch-ext> ^ +sage_attention-torch-ext> +sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(170): warning #177-D: variable "sO" was declared but never referenced +sage_attention-torch-ext> half *sO = (half*)smem_; +sage_attention-torch-ext> ^ +sage_attention-torch-ext> +sage_attention-torch-ext> ptxas info : 28 bytes gmem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 164.799 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 162.299 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 167.071 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 167.305 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 163.529 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 162.413 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 167.845 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 
166.829 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 170.956 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 171.643 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 171.202 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 169.275 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes 
smem +sage_attention-torch-ext> ptxas info : Compile time = 162.832 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 163.102 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 167.108 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 167.067 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 102.579 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 
128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 102.599 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 106.308 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 106.308 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 101.930 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 101.802 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill 
loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 105.487 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 106.022 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 107.688 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 108.189 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 112.518 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf 
+sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 112.481 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 108.660 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 108.254 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 112.380 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 111.712 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 151.116 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 150.731 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 155.635 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 156.071 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 150.923 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' 
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 151.125 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 155.642 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 155.914 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 158.104 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 159.048 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 163.044 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 163.951 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 159.584 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 160.201 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info 
: Compile time = 160.848 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 160.423 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 100.711 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 100.134 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 102.477 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes 
cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 102.970 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 99.158 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 100.406 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 103.529 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 102.484 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads 
+sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 105.449 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 105.326 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 110.221 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 108.746 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 105.144 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes 
spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 105.226 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 109.365 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 109.624 ms +sage_attention-torch-ext> [3/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_attn.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : 28 bytes gmem, 224 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 428.455 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 
488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info    : Compile time = 429.635 ms
[ptxas output for the remaining sm_89 instantiations of qk_int_sv_f8_attn_kernel in this translation unit condensed: each used 229-255 registers and a 0-64 byte stack frame (at most 36 bytes of spill stores), with per-kernel compile times of roughly 0.22-0.55 s]
+sage_attention-torch-ext> [4/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_attn.cu.o
+sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
+sage_attention-torch-ext> ptxas info    : 28 bytes gmem, 224 bytes cmem[4]
[ptxas output for the sm_89 instantiations of qk_int_sv_f8_attn_kernel in this translation unit condensed: each used 231-255 registers and a 0-80 byte stack frame (at most 44 bytes of spill stores), with per-kernel compile times of roughly 0.23-0.57 s]
+sage_attention-torch-ext> [5/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f16_attn_inst_buf.cu.o
+sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
+sage_attention-torch-ext> ptxas info    : 28 bytes gmem, 224 bytes cmem[4]
[ptxas output for the sm_89 instantiations of qk_int_sv_f8_attn_kernel compiled so far in this translation unit condensed: each used 249-255 registers and a 0-112 byte stack frame (at most 176 bytes of spill stores), with per-kernel compile times of roughly 0.23-0.59 s]
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 253.171 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 249.628 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 256.891 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 256.335 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 253 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile 
time = 248.943 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 253 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 250.335 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 257.577 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 255.777 ms +sage_attention-torch-ext> [6/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_attn_inst_buf.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : 28 bytes gmem, 224 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 64 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas 
info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 548.097 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 64 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 531.494 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 8 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 541.595 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 8 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 541.311 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 84 bytes spill stores, 56 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 
543.318 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 84 bytes spill stores, 56 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 540.357 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 16 bytes spill stores, 12 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 540.372 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 16 bytes spill stores, 12 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 540.675 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 64 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 465.859 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 64 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 457.031 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 573.905 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 573.667 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 36 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 460.213 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 36 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 460.097 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 568.269 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 568.915 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 284.879 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 
'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 283.571 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 254.400 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 250.976 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 287.723 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 285.951 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 251.519 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 250.311 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 309.154 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf 
+sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 307.970 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 319.757 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 317.137 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 308.197 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile 
time = 307.759 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 312.691 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 312.052 ms +sage_attention-torch-ext> [7/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : 28 bytes gmem, 224 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 56 bytes stack frame, 72 bytes spill stores, 56 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 56 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 568.453 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 56 bytes stack frame, 72 bytes spill stores, 56 
bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 56 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 568.150 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 80 bytes spill stores, 68 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 574.691 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 80 bytes spill stores, 68 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 577.777 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 555.916 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] 
+sage_attention-torch-ext> ptxas info : Compile time = 554.288 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 32 bytes spill stores, 28 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 564.822 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 32 bytes spill stores, 28 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 565.896 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 96 bytes stack frame, 176 bytes spill stores, 168 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 519.489 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 96 bytes stack frame, 176 bytes spill stores, 168 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 509.876 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 96 bytes stack frame, 92 bytes spill stores, 88 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 608.774 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 96 bytes stack frame, 92 bytes spill stores, 88 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 610.988 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 80 bytes spill stores, 68 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 496.281 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 80 bytes spill stores, 68 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 497.232 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 80 bytes stack frame, 60 bytes spill stores, 48 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 598.825 ms
[... ptxas output for the remaining sm_89 qk_int_sv_f8_attn_kernel instantiations of this object elided: each entry function reports 251-255 registers, 0-80 bytes stack frame, at most 60/48 bytes of spill stores/loads, 488 bytes cmem[0], 8 bytes cmem[2], and a compile time of roughly 236-600 ms ...]
+sage_attention-torch-ext> [8/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf.cu.o
+sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
+sage_attention-torch-ext> ptxas info : 28 bytes gmem, 224 bytes cmem[4]
[... ptxas output for the sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf qk_int_sv_f8_attn_kernel instantiations elided: each entry function reports 255 registers, 0-80 bytes stack frame, at most 76/44 bytes of spill stores/loads, 488 bytes cmem[0], 8 bytes cmem[2], and a compile time of roughly 300-600 ms ...]
+sage_attention-torch-ext> [11/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/qk_int_sv_f16_cuda_sm80.cu.o
+sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
+sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4]
[... ptxas output for the sm_80 qk_int_sv_f16_attn_kernel instantiations elided: each entry function reports 176-255 registers, 0-32 bytes stack frame, no register spills, 480 bytes cmem[0], and a compile time of roughly 197-597 ms ...]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj16ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80'
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj16ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 184 registers, used 1 barriers,
16 bytes cumulative stack size, 480 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 222.448 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj16ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj16ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 184 registers, used 1 barriers, 16 bytes cumulative stack size, 480 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 216.447 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj16ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj16ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 180 registers, used 1 barriers, 480 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 222.938 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj16ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj16ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 180 registers, used 1 barriers, 480 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 220.112 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj16ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj16ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 177 registers, used 1 barriers, 16 bytes cumulative stack size, 480 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 225.064 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj16ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj16ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 177 registers, used 1 barriers, 16 bytes cumulative stack size, 480 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 225.100 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj16ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj16ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 177 registers, used 1 barriers, 480 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 221.324 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj16ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj16ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 177 registers, used 1 barriers, 480 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 220.230 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj16ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj16ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 178 registers, used 1 barriers, 16 bytes cumulative stack size, 480 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 224.575 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj16ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj16ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 178 registers, used 1 barriers, 16 bytes cumulative stack size, 480 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 224.262 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 253 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 265.648 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 253 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 265.222 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 249 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 272.418 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 
249 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 271.543 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 244.082 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 245.818 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 253 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 270.600 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 253 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 271.513 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 285.432 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 284.214 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 290.738 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 290.681 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 281.864 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 289.314 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 298.218 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 297.861 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads 
+sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 547.650 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 515.155 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 248 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 548.305 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 248 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 522.745 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 426.900 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 405.909 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 539.708 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 512.176 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 248 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 555.106 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 248 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 529.515 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 253 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 566.544 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 253 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 539.557 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 249 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 554.977 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads 
+sage_attention-torch-ext> ptxas info : Used 249 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 526.419 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 250 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 565.716 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 250 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 537.362 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 201 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 242.289 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 205 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 234.398 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 205 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 248.279 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 207 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 239.625 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 204 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 241.500 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 204 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 234.725 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 200 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 247.759 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 200 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 239.854 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 199 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 261.146 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 199 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 251.628 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads 
+sage_attention-torch-ext> ptxas info : Used 199 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 267.309 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 199 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 258.554 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 198 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 259.758 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 198 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 252.118 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 200 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 267.376 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
[... ptxas statistics for the remaining qk_int_sv_f16_attn_kernel template instantiations targeting 'sm_80' (half and bfloat16 outputs across the QuantGranularity and MaskMode combinations): each entry reports 200-255 registers, a 0-112 byte stack frame, at most 68 bytes of spill stores and 76 bytes of spill loads, 480 bytes cmem[0], and a compile time of roughly 245-500 ms ...]
+sage_attention-torch-ext> [12/12] Linking CXX shared module _sage_attention_44b112f_dirty.abi3.so
+sage_attention-torch-ext> buildPhase completed in 3 minutes 37 seconds
+sage_attention-torch-ext> Running phase: installPhase
+sage_attention-torch-ext> install flags: -j21 install
+sage_attention-torch-ext> [0/1] Install the project...
+sage_attention-torch-ext> -- Install configuration: "Release"
+sage_attention-torch-ext> -- Installing: /nix/store/6jmb39fj6a2hjg90bjmrxna5vkivwy8n-sage_attention-torch-ext/_sage_attention_44b112f_dirty/_sage_attention_44b112f_dirty.abi3.so
+sage_attention-torch-ext> Running phase: fixupPhase
+sage_attention-torch-ext> shrinking RPATHs of ELF executables and libraries in /nix/store/6jmb39fj6a2hjg90bjmrxna5vkivwy8n-sage_attention-torch-ext
+sage_attention-torch-ext> shrinking /nix/store/6jmb39fj6a2hjg90bjmrxna5vkivwy8n-sage_attention-torch-ext/sage_attention/_sage_attention_44b112f_dirty.abi3.so
+sage_attention-torch-ext> checking for references to /build/ in /nix/store/6jmb39fj6a2hjg90bjmrxna5vkivwy8n-sage_attention-torch-ext...
+sage_attention-torch-ext> patching script interpreter paths in /nix/store/6jmb39fj6a2hjg90bjmrxna5vkivwy8n-sage_attention-torch-ext
+sage_attention-torch-ext> Running phase: installCheckPhase
+sage_attention-torch-ext> no Makefile or custom installCheckPhase, doing nothing
+sage_attention-torch-ext> Checking of ABI compatibility
+sage_attention-torch-ext> 🐍 Checking for compatibility with manylinux_2_28 and Python ABI version 3.9
+sage_attention-torch-ext> ✅ No compatibility issues found
+sage_attention-torch-ext> Checking loading kernel with get_kernel
+sage_attention-torch-ext> Check whether the kernel can be loaded with get-kernel: sage_attention
+sage_attention-torch-ext> [11/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/qk_int_sv_f16_cuda_sm80.cu.o
+sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
+sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4]
[... ptxas statistics for the qk_int_sv_f16_attn_kernel instantiations in qk_int_sv_f16_cuda_sm80.cu targeting 'sm_80': each entry reports 176-255 registers, a 0-32 byte stack frame, no register spills, and 480 bytes cmem[0] ...]
+sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 248 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 248 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill 
stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 248 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 248 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 253 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 253 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 249 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 249 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 250 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 250 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 201 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 205 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 205 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 207 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 204 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 204 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes 
spill loads +sage_attention-torch-ext> ptxas info : Used 200 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 200 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 199 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 199 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 199 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 199 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 198 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 198 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 200 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 200 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 32 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 32 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 32 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 32 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf 
+sage_attention-torch-ext> 32 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 60 bytes spill stores, 68 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 60 bytes spill stores, 68 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 112 bytes stack frame, 68 bytes spill stores, 76 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 112 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 112 bytes stack frame, 68 bytes spill stores, 76 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 112 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 56 bytes stack frame, 56 bytes spill stores, 60 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 56 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 56 
bytes stack frame, 56 bytes spill stores, 60 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 56 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 96 bytes stack frame, 60 bytes spill stores, 64 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 96 bytes stack frame, 60 bytes spill stores, 64 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> 
ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> [12/12] Linking CXX shared module _sage_attention_44b112f_dirty.abi3.so +sage_attention-torch-ext> buildPhase completed in 3 minutes 42 seconds +sage_attention-torch-ext> Running phase: installPhase +sage_attention-torch-ext> install flags: -j21 install +sage_attention-torch-ext> [0/1] Install the project... 
+sage_attention-torch-ext> -- Install configuration: "Release"
+sage_attention-torch-ext> -- Installing: /nix/store/794gpxvpsn0jkzi2zndpd6i6nhspwid2-sage_attention-torch-ext/_sage_attention_44b112f_dirty/_sage_attention_44b112f_dirty.abi3.so
+sage_attention-torch-ext> Running phase: fixupPhase
+sage_attention-torch-ext> shrinking RPATHs of ELF executables and libraries in /nix/store/794gpxvpsn0jkzi2zndpd6i6nhspwid2-sage_attention-torch-ext
+sage_attention-torch-ext> shrinking /nix/store/794gpxvpsn0jkzi2zndpd6i6nhspwid2-sage_attention-torch-ext/sage_attention/_sage_attention_44b112f_dirty.abi3.so
+sage_attention-torch-ext> checking for references to /build/ in /nix/store/794gpxvpsn0jkzi2zndpd6i6nhspwid2-sage_attention-torch-ext...
+sage_attention-torch-ext> patching script interpreter paths in /nix/store/794gpxvpsn0jkzi2zndpd6i6nhspwid2-sage_attention-torch-ext
+sage_attention-torch-ext> Running phase: installCheckPhase
+sage_attention-torch-ext> no Makefile or custom installCheckPhase, doing nothing
+sage_attention-torch-ext> Checking of ABI compatibility
+sage_attention-torch-ext> 🐍 Checking for compatibility with manylinux_2_28 and Python ABI version 3.9
+sage_attention-torch-ext> ✅ No compatibility issues found
+sage_attention-torch-ext> Checking loading kernel with get_kernel
+sage_attention-torch-ext> Check whether the kernel can be loaded with get-kernel: sage_attention
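This final check loads the freshly built extension through the get_kernel helper of the Hugging Face kernels package. For reference, a roughly equivalent load from the Hub could look like the sketch below; the repo id is an assumption for illustration and is not taken from this log:

    # Minimal sketch of loading the kernel with get_kernel (assumed repo id).
    from kernels import get_kernel

    sage_attention = get_kernel("kernels-community/sage_attention")  # hypothetical repo id
    # Listing the public attributes confirms the abi3 extension imported cleanly.
    print([name for name in dir(sage_attention) if not name.startswith("_")])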
+sage_attention-torch-ext> [11/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/qk_int_sv_f16_cuda_sm80.cu.o
+sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
+sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4]
[... per-kernel ptxas resource-usage and compile-time reports for the qk_int_sv_f16_attn_kernel instantiations for 'sm_80' (__half/__nv_bfloat16 variants, QuantGranularity 2/3, MaskMode 0/1): 176-255 registers, 1 barrier, 0-32 bytes stack frame, no spill stores or loads, 480 bytes cmem[0] plus 8-16 bytes cmem[2] for some variants, compile times roughly 200-600 ms per instantiation ...]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80'
+sage_attention-torch-ext> ptxas info : Function properties for
_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 201 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 248.386 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 205 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 238.228 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 205 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 254.757 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 207 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 243.618 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads 
+sage_attention-torch-ext> ptxas info : Used 204 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 246.223 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 204 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 237.605 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 200 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 252.583 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 200 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 245.237 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 199 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 263.119 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 199 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 254.398 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 199 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 270.412 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 199 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 261.949 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 198 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 263.559 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 198 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 254.463 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 200 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 271.281 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 200 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 263.448 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 32 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 483.972 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 32 bytes spill stores, 
40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 478.741 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 32 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 492.247 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 32 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 499.507 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 478.163 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 
480.421 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 478.013 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 479.704 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 60 bytes spill stores, 68 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 492.647 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 60 bytes spill stores, 68 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 496.343 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 112 bytes stack frame, 68 bytes spill stores, 76 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 112 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 506.130 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 112 bytes stack frame, 68 bytes spill stores, 76 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 112 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 505.690 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 56 bytes stack frame, 56 bytes spill stores, 60 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 56 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 499.671 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 56 bytes stack frame, 56 bytes spill stores, 60 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 56 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 496.812 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' 
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 96 bytes stack frame, 60 bytes spill stores, 64 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 507.126 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 96 bytes stack frame, 60 bytes spill stores, 64 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 505.885 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 250.743 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 250.403 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes 
stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 258.148 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 257.065 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 250.576 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 249.934 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 257.861 ms +sage_attention-torch-ext> ptxas info : 
Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 257.579 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 269.519 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 269.729 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 277.652 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 277.765 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 268.882 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 269.865 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 276.602 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> 
ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 276.939 ms
+sage_attention-torch-ext> [12/12] Linking CXX shared module _sage_attention_44b112f_dirty.abi3.so
+sage_attention-torch-ext> buildPhase completed in 3 minutes 45 seconds
+sage_attention-torch-ext> Running phase: installPhase
+sage_attention-torch-ext> install flags: -j21 install
+sage_attention-torch-ext> [0/1] Install the project...
+sage_attention-torch-ext> -- Install configuration: "Release"
+sage_attention-torch-ext> -- Installing: /nix/store/x0vcv18dr2mcj5ih9i3aq3nshydimpca-sage_attention-torch-ext/_sage_attention_44b112f_dirty/_sage_attention_44b112f_dirty.abi3.so
+sage_attention-torch-ext> Running phase: fixupPhase
+sage_attention-torch-ext> shrinking RPATHs of ELF executables and libraries in /nix/store/x0vcv18dr2mcj5ih9i3aq3nshydimpca-sage_attention-torch-ext
+sage_attention-torch-ext> shrinking /nix/store/x0vcv18dr2mcj5ih9i3aq3nshydimpca-sage_attention-torch-ext/sage_attention/_sage_attention_44b112f_dirty.abi3.so
+sage_attention-torch-ext> checking for references to /build/ in /nix/store/x0vcv18dr2mcj5ih9i3aq3nshydimpca-sage_attention-torch-ext...
+sage_attention-torch-ext> patching script interpreter paths in /nix/store/x0vcv18dr2mcj5ih9i3aq3nshydimpca-sage_attention-torch-ext
+sage_attention-torch-ext> Running phase: installCheckPhase
+sage_attention-torch-ext> no Makefile or custom installCheckPhase, doing nothing
+sage_attention-torch-ext> Checking of ABI compatibility
+sage_attention-torch-ext> 🐍 Checking for compatibility with manylinux_2_28 and Python ABI version 3.9
+sage_attention-torch-ext> ✅ No compatibility issues found
+sage_attention-torch-ext> Checking loading kernel with get_kernel
+sage_attention-torch-ext> Check whether the kernel can be loaded with get-kernel: sage_attention
+sage_attention-torch-ext> [11/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/qk_int_sv_f16_cuda_sm80.cu.o
+sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
+sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80'
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80'
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 249 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 250 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 249 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties 
+sage_attention-torch-ext> ptxas info : [verbose per-instantiation output elided] remaining qk_int_sv_f16_attn_kernel template instantiations compiled for 'sm_80'
+sage_attention-torch-ext> ptxas info : 176-255 registers, 0-64 bytes stack frame, 480 bytes cmem[0], up to 16 bytes cmem[2]; spilling (at most 32 bytes spill stores / 40 bytes spill loads) occurs only in a few variants at the 255-register ceiling
+sage_attention-torch-ext> 64 bytes stack frame, 60 bytes spill stores, 68 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 60 bytes spill stores, 68 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 112 bytes stack frame, 68 bytes spill stores, 76 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 112 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 112 bytes stack frame, 68 bytes spill stores, 76 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 112 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 56 bytes stack frame, 56 bytes spill stores, 60 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 56 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 56 bytes stack frame, 56 bytes spill stores, 60 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 56 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 96 bytes stack frame, 60 bytes spill stores, 64 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 96 bytes stack frame, 60 bytes spill stores, 64 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 
bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> 
ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> [9/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_fuse_v_mean_attn.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : 28 bytes gmem, 224 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 32 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 540.842 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 32 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 537.877 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 24 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 550.018 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 24 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 548.929 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 536.450 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 537.209 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 534.340 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 534.521 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 471.004 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 472.920 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 48 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 573.035 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 48 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 571.750 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 469.703 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 473.748 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 571.073 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 567.513 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 245 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 235.378 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 245 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 236.776 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 231 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 242.118 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 
bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 231 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 241.920 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 236 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 235.965 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 236 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 234.354 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 231 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 239.999 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 231 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 242.139 ms 
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 234 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 257.654 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 234 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 255.864 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 260.515 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 260.248 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 238 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 255.341 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 238 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 254.405 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 233 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 260.494 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 233 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 260.957 ms +sage_attention-torch-ext> [12/12] Linking CXX shared module _sage_attention_44b112f_dirty.abi3.so +sage_attention-torch-ext> buildPhase completed in 3 minutes 49 seconds +sage_attention-torch-ext> Running phase: installPhase +sage_attention-torch-ext> install flags: -j21 install +sage_attention-torch-ext> [0/1] 
Install the project... +sage_attention-torch-ext> -- Install configuration: "Release" +sage_attention-torch-ext> -- Installing: /nix/store/wqplrwp0m14wqjzmh3x18a5pgc2kcs8k-sage_attention-torch-ext/_sage_attention_44b112f_dirty/_sage_attention_44b112f_dirty.abi3.so +sage_attention-torch-ext> Running phase: fixupPhase +sage_attention-torch-ext> shrinking RPATHs of ELF executables and libraries in /nix/store/wqplrwp0m14wqjzmh3x18a5pgc2kcs8k-sage_attention-torch-ext +sage_attention-torch-ext> shrinking /nix/store/wqplrwp0m14wqjzmh3x18a5pgc2kcs8k-sage_attention-torch-ext/sage_attention/_sage_attention_44b112f_dirty.abi3.so +sage_attention-torch-ext> checking for references to /build/ in /nix/store/wqplrwp0m14wqjzmh3x18a5pgc2kcs8k-sage_attention-torch-ext... +sage_attention-torch-ext> patching script interpreter paths in /nix/store/wqplrwp0m14wqjzmh3x18a5pgc2kcs8k-sage_attention-torch-ext +sage_attention-torch-ext> Running phase: installCheckPhase +sage_attention-torch-ext> no Makefile or custom installCheckPhase, doing nothing +sage_attention-torch-ext> Checking of ABI compatibility +sage_attention-torch-ext> 🐍 Checking for compatibility with manylinux_2_28 and Python ABI version 3.9 +sage_attention-torch-ext> ✅ No compatibility issues found +sage_attention-torch-ext> Checking loading kernel with get_kernel +sage_attention-torch-ext> Check whether the kernel can be loaded with get-kernel: sage_attention +sage_attention-torch-ext> [10/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/fused/fused.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : 28 bytes gmem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb1E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb1E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 392 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 33.289 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb1E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb1E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 392 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 29.922 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb0E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb0E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 260 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 25.506 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb0E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z15MeanScaleKernelILj64ELb0E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 260 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 24.229 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj128ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj128ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 21 registers, used 1 barriers, 32768 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 4.507 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj64ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj64ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 19 registers, used 1 barriers, 16384 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 4.402 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj128ELj64ELb1E6__halfEvPT2_S2_jjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj128ELj64ELb1E6__halfEvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 21 registers, used 1 barriers, 32768 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 4.284 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 19 registers, used 1 barriers, 16384 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 4.249 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers +sage_attention-torch-ext> ptxas info : Compile time = 4.153 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers +sage_attention-torch-ext> ptxas info : Compile time = 4.124 ms +sage_attention-torch-ext> ptxas info : Compiling entry 
function '_Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 20 registers, used 0 barriers +sage_attention-torch-ext> ptxas info : Compile time = 3.441 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 20 registers, used 0 barriers +sage_attention-torch-ext> ptxas info : Compile time = 3.396 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 7.558 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 7.568 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 7.634 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 7.541 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads 
+sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 7.451 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 7.412 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 7.383 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 7.331 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 38 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 12.259 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 29 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 8.726 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 8.702 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 29 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 8.676 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 38 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 11.785 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 29 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 8.582 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 8.636 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 29 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 8.604 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 11.001 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 7.682 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 8.202 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 7.571 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 10.582 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 7.640 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 7.583 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 7.689 ms +sage_attention-torch-ext> ptxas info : Compiling entry 
function '_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 11.382 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 8.123 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 8.179 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 8.029 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 11.805 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 8.019 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj 
+sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 8.021 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 7.987 ms +sage_attention-torch-ext> ptxas info : 28 bytes gmem, 224 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb1E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb1E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 392 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 22.199 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb1E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb1E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 392 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 20.236 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb0E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb0E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 260 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 15.886 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb0E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb0E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 260 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 15.927 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj128ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj128ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 32768 bytes smem, 396 bytes cmem[0] +sage_attention-torch-ext> ptxas info 
: Compile time = 3.053 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj64ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj64ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 20 registers, used 1 barriers, 16384 bytes smem, 396 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 3.023 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj128ELj64ELb1E6__halfEvPT2_S2_jjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj128ELj64ELb1E6__halfEvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 32768 bytes smem, 396 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 3.037 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 20 registers, used 1 barriers, 16384 bytes smem, 396 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 3.014 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 2.934 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 2.911 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 20 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 2.539 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj 
+sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 20 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 2.515 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 6.825 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 6.758 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 6.799 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 6.780 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 6.809 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 
barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 6.821 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 6.860 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 6.802 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 38 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 10.542 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.751 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.764 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.734 ms +sage_attention-torch-ext> 
ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 38 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 10.465 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.805 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.770 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.803 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 9.826 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 6.861 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> 
ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 6.836 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 6.775 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 9.283 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 6.797 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 6.852 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 6.792 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 
0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 9.622 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 6.937 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 6.951 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.403 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 9.823 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 6.956 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] 
+sage_attention-torch-ext> ptxas info : Compile time = 6.932 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 6.920 ms +sage_attention-torch-ext> ptxas info : 28 bytes gmem, 224 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb1E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb1E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 39 registers, used 1 barriers, 392 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 29.989 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb1E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb1E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 39 registers, used 1 barriers, 392 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 27.743 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb0E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb0E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 40 registers, used 1 barriers, 260 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 21.940 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb0E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb0E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 40 registers, used 1 barriers, 260 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 22.046 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj128ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj128ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 21 registers, used 1 barriers, 32768 bytes smem, 396 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 3.158 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25TransposePadPermuteKernelILj64ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj64ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 19 registers, used 1 barriers, 16384 bytes smem, 396 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 3.085 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj128ELj64ELb1E6__halfEvPT2_S2_jjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj128ELj64ELb1E6__halfEvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 21 registers, used 1 barriers, 32768 bytes smem, 396 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 3.102 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 19 registers, used 1 barriers, 16384 bytes smem, 396 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 3.056 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 2.995 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 2.998 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 18 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 2.554 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads 
+sage_attention-torch-ext> ptxas info : Used 18 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 2.556 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 6.914 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 6.864 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 6.853 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 6.860 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 6.920 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile 
time = 6.835 ms
[ptxas output condensed: the QuantInt8Kernel template instantiations in this span (__half and __nv_bfloat16 variants, block sizes 64/128) compile for 'sm_89' using 25-40 registers, 1 barrier, 132 bytes smem and 432 bytes cmem[0] each, with 0 bytes stack frame and 0 bytes spill stores/loads; compile times range from about 6.8 ms to 10.6 ms per instantiation.]
+sage_attention-torch-ext> [11/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/qk_int_sv_f16_cuda_sm80.cu.o
+sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
+sage_attention-torch-ext> ptxas info : 28 bytes gmem, 224 bytes cmem[4]
[ptxas output condensed: the qk_int_sv_f16_attn_kernel template instantiations compile for 'sm_80' using 176-255 registers, 1 barrier, 480 bytes cmem[0], 8 or 16 bytes cmem[2] where present, and 0-32 bytes stack frame each, with 0 bytes spill stores/loads throughout; compile times range from roughly 200 ms to 570 ms per instantiation.]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80'
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes
spill loads +sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 461.929 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 440.182 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 472.256 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 449.929 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 250 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 460.100 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 250 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 437.509 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 471.040 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 449.202 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 201 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 237.683 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 205 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 228.527 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 205 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 241.745 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 207 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 232.871 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 204 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 235.932 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads 
+sage_attention-torch-ext> ptxas info : Used 204 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 227.876 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 200 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 242.745 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 200 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 234.589 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 199 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 253.400 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 199 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 245.344 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 199 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 260.897 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 199 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 252.860 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 198 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 254.826 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 198 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 245.663 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 200 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 261.039 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 200 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 251.908 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 32 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 460.584 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 32 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 458.619 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 
64 bytes stack frame, 32 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 475.564 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 32 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 476.445 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 461.818 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 463.274 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] 
+sage_attention-torch-ext> ptxas info : Compile time = 464.385 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 466.754 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 60 bytes spill stores, 68 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 481.417 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 60 bytes spill stores, 68 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 478.971 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 112 bytes stack frame, 68 bytes spill stores, 76 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 112 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 488.346 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 112 bytes stack frame, 68 bytes spill stores, 76 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 112 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 489.099 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 56 bytes stack frame, 56 bytes spill stores, 60 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 56 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 478.063 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 56 bytes stack frame, 56 bytes spill stores, 60 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 56 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 477.882 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 96 bytes stack frame, 60 bytes spill stores, 64 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 487.670 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' 
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 96 bytes stack frame, 60 bytes spill stores, 64 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 485.269 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 239.050 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 236.372 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 241.725 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 
bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 242.865 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 235.112 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 235.454 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 242.402 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 242.226 ms +sage_attention-torch-ext> ptxas info : Compiling entry 
function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 254.271 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 253.270 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 260.606 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 259.949 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 253.865 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 253.525 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 260.816 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 259.752 ms +sage_attention-torch-ext> [12/12] Linking CXX shared module _sage_attention_44b112f_dirty.abi3.so +sage_attention-torch-ext> buildPhase completed in 5 minutes 7 seconds +sage_attention-torch-ext> Running phase: installPhase +sage_attention-torch-ext> install flags: -j21 install +sage_attention-torch-ext> [0/1] Install the project... 
+sage_attention-torch-ext> -- Install configuration: "Release" +sage_attention-torch-ext> -- Installing: /nix/store/vas14gsv17kdkap3r9szvjsclanmwq25-sage_attention-torch-ext/_sage_attention_44b112f_dirty/_sage_attention_44b112f_dirty.abi3.so +sage_attention-torch-ext> Running phase: fixupPhase +sage_attention-torch-ext> shrinking RPATHs of ELF executables and libraries in /nix/store/vas14gsv17kdkap3r9szvjsclanmwq25-sage_attention-torch-ext +sage_attention-torch-ext> shrinking /nix/store/vas14gsv17kdkap3r9szvjsclanmwq25-sage_attention-torch-ext/sage_attention/_sage_attention_44b112f_dirty.abi3.so +sage_attention-torch-ext> checking for references to /build/ in /nix/store/vas14gsv17kdkap3r9szvjsclanmwq25-sage_attention-torch-ext... +sage_attention-torch-ext> patching script interpreter paths in /nix/store/vas14gsv17kdkap3r9szvjsclanmwq25-sage_attention-torch-ext +sage_attention-torch-ext> Running phase: installCheckPhase +sage_attention-torch-ext> no Makefile or custom installCheckPhase, doing nothing +sage_attention-torch-ext> Checking of ABI compatibility +sage_attention-torch-ext> 🐍 Checking for compatibility with manylinux_2_28 and Python ABI version 3.9 +sage_attention-torch-ext> ✅ No compatibility issues found +sage_attention-torch-ext> Checking loading kernel with get_kernel +sage_attention-torch-ext> Check whether the kernel can be loaded with get-kernel: sage_attention +building '/nix/store/xq28asxbqp6g7x8bcz92xl849prg2899-torch-ext-bundle.drv'... +building '/nix/store/rkzh9xwk6kdgl1by4xfwmyvb5arpfqby-build-and-copy.drv'... diff --git a/sage_attention/cuda_tensormap_shim.cuh b/sage_attention/cuda_tensormap_shim.cuh index 2b8dc2ff500a7486611c15d9ede13333854235d6..cadddb6646e4c38e76e1fe0cfefbf8e17604b7ed 100644 --- a/sage_attention/cuda_tensormap_shim.cuh +++ b/sage_attention/cuda_tensormap_shim.cuh @@ -1,61 +1,46 @@ -/* - * Lightweight compatibility shim for CUDA tensor map APIs. - * Provides fallbacks for CUtensorMap and related enums when compiling - * against CUDA toolkits that don't expose these symbols in headers. - */ - #pragma once - #include -// Guard on CUDA version and symbol presence. Some environments have -// runtime symbols but not headers; we define minimal stand-ins. 
+// Provide fallbacks only if CUDA headers don’t define tensor map +#if !defined(CU_TENSOR_MAP_NUM_QWORDS) -#ifndef CU_TENSOR_MAP_L2_PROMOTION_NONE -typedef enum CUtensorMapL2promotion_enum { - CU_TENSOR_MAP_L2_PROMOTION_NONE = 0, - CU_TENSOR_MAP_L2_PROMOTION_L2_64B = 1, - CU_TENSOR_MAP_L2_PROMOTION_L2_128B = 2 -} CUtensorMapL2promotion_enum; +// Layout-compatible stand-in +#if defined(__cplusplus) && (__cplusplus >= 201103L) +struct alignas(64) CUtensorMap_st { unsigned long long opaque[16]; }; +#else +struct CUtensorMap_st { unsigned long long opaque[16]; }; #endif - -#ifndef CUtensorMap -typedef struct CUtensorMap_st { - unsigned long long data[16]; -} CUtensorMap; -#endif - -#ifndef CU_TENSOR_MAP_DATA_TYPE_UINT8 -typedef enum CUtensorMapDataType { - CU_TENSOR_MAP_DATA_TYPE_UINT8 = 1, - CU_TENSOR_MAP_DATA_TYPE_INT8 = 2, - CU_TENSOR_MAP_DATA_TYPE_FLOAT16 = 10, - CU_TENSOR_MAP_DATA_TYPE_BFLOAT16 = 13 +typedef CUtensorMap_st CUtensorMap; + +// Minimal enums used by create_tensor_map_4D +typedef enum CUtensorMapDataType_enum { + CU_TENSOR_MAP_DATA_TYPE_UINT8 = 0, + CU_TENSOR_MAP_DATA_TYPE_FLOAT16 = 6, + CU_TENSOR_MAP_DATA_TYPE_FLOAT32 = 7, + CU_TENSOR_MAP_DATA_TYPE_FLOAT64 = 8, + CU_TENSOR_MAP_DATA_TYPE_BFLOAT16 = 10 } CUtensorMapDataType; -#endif -#ifndef CU_TENSOR_MAP_INTERLEAVE_NONE typedef enum CUtensorMapInterleave_enum { - CU_TENSOR_MAP_INTERLEAVE_NONE = 0 -} CUtensorMapInterleave_enum; -#endif + CU_TENSOR_MAP_INTERLEAVE_NONE = 0 +} CUtensorMapInterleave; -#ifndef CU_TENSOR_MAP_SWIZZLE_32B typedef enum CUtensorMapSwizzle_enum { - CU_TENSOR_MAP_SWIZZLE_NONE = 0, - CU_TENSOR_MAP_SWIZZLE_32B = 1, - CU_TENSOR_MAP_SWIZZLE_64B = 2, - CU_TENSOR_MAP_SWIZZLE_128B = 3 -} CUtensorMapSwizzle_enum; -#endif + CU_TENSOR_MAP_SWIZZLE_NONE = 0, + CU_TENSOR_MAP_SWIZZLE_32B, + CU_TENSOR_MAP_SWIZZLE_64B, + CU_TENSOR_MAP_SWIZZLE_128B +} CUtensorMapSwizzle; -#ifndef CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE -typedef enum CUtensorMapFloatOOBfill_enum { - CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE = 0 -} CUtensorMapFloatOOBfill_enum; -#endif - -// We intentionally do not declare cuTensorMapEncodeTiled here; the code -// dynamically resolves it from libcuda at runtime when available. +typedef enum CUtensorMapL2promotion_enum { + CU_TENSOR_MAP_L2_PROMOTION_NONE = 0, + CU_TENSOR_MAP_L2_PROMOTION_L2_64B, + CU_TENSOR_MAP_L2_PROMOTION_L2_128B +} CUtensorMapL2promotion; +typedef enum CUtensorMapFloatOOBfill_enum { + CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE = 0 +} CUtensorMapFloatOOBfill; +#endif // !defined(CU_TENSOR_MAP_NUM_QWORDS) +// no declaration of cuTensorMapEncodeTiled here; it’s resolved at runtime \ No newline at end of file diff --git a/sage_attention/qattn/qk_int_sv_f8_cuda_sm89.cuh b/sage_attention/qattn/qk_int_sv_f8_cuda_sm89.cuh index d829aa44b90259cc9b1a647e20f6056d41f93a9f..be88db5b9639eafe7163e3091bd343afec489167 100644 --- a/sage_attention/qattn/qk_int_sv_f8_cuda_sm89.cuh +++ b/sage_attention/qattn/qk_int_sv_f8_cuda_sm89.cuh @@ -15,7 +15,7 @@ */ #include "../utils.cuh" -#include +// #include #include #include diff --git a/torch-ext/sage_attention/_ops.py b/torch-ext/sage_attention/_ops.py deleted file mode 100644 index 948e49e1434a82a7b94b2d78caf7d588365d9e72..0000000000000000000000000000000000000000 --- a/torch-ext/sage_attention/_ops.py +++ /dev/null @@ -1,9 +0,0 @@ -import torch -from . import _sage_attention_57cb7ec_dirty -ops = torch.ops._sage_attention_57cb7ec_dirty - -def add_op_namespace_prefix(op_name: str): - """ - Prefix op by namespace. 
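Aside on the shim's closing note (cuTensorMapEncodeTiled is not declared in the header and is resolved at runtime): the sketch below shows one common way such lazy resolution can be done, by looking the symbol up in libcuda.so.1 with dlopen/dlsym and reusing the stand-in types defined above. This is an illustrative sketch only, not code from this change: the helper name get_cu_tensor_map_encode_tiled, the dlopen-based lookup, and the static_asserts are assumptions, and the function-pointer signature is transcribed from the CUDA driver API documentation.

// Illustrative sketch (not part of this change): lazily resolve
// cuTensorMapEncodeTiled from the driver at runtime, so the shim above only
// needs the layout-compatible CUtensorMap stand-in and its enums.
#include <dlfcn.h>    // dlopen / dlsym (POSIX)
#include <cstdint>
#include "sage_attention/cuda_tensormap_shim.cuh"

// Return type is the driver's CUresult (an int-sized enum; 0 == CUDA_SUCCESS).
using cuTensorMapEncodeTiled_t = int (*)(
    CUtensorMap* tensorMap, CUtensorMapDataType dataType, uint32_t rank,
    void* globalAddress, const uint64_t* globalDim,
    const uint64_t* globalStrides,            // rank - 1 entries
    const uint32_t* boxDim, const uint32_t* elementStrides,
    CUtensorMapInterleave interleave, CUtensorMapSwizzle swizzle,
    CUtensorMapL2promotion l2Promotion, CUtensorMapFloatOOBfill oobFill);

// Resolve once; nullptr means the installed driver has no tensor-map support.
inline cuTensorMapEncodeTiled_t get_cu_tensor_map_encode_tiled() {
  static cuTensorMapEncodeTiled_t fn = [] {
    void* libcuda = dlopen("libcuda.so.1", RTLD_NOW | RTLD_GLOBAL);
    return libcuda ? reinterpret_cast<cuTensorMapEncodeTiled_t>(
                         dlsym(libcuda, "cuTensorMapEncodeTiled"))
                   : nullptr;
  }();
  return fn;
}

// The stand-in must stay layout-compatible with the real driver type:
// 16 x 64-bit opaque words (128 bytes), 64-byte aligned.
static_assert(sizeof(CUtensorMap) == 128, "CUtensorMap stand-in has wrong size");
static_assert(alignof(CUtensorMap) == 64, "CUtensorMap stand-in has wrong alignment");

Resolving the symbol this way keeps the extension loadable on drivers and toolkits that predate tensor maps; callers simply check the returned pointer for nullptr before taking any tensor-map path.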
- """ - return f"_sage_attention_57cb7ec_dirty::{op_name}" \ No newline at end of file diff --git a/torch-ext/sage_attention/_sage_attention_57cb7ec_dirty.abi3.so b/torch-ext/sage_attention/_sage_attention_57cb7ec_dirty.abi3.so deleted file mode 100755 index f14c4d32d36a2eccc732740e76a085337076546b..0000000000000000000000000000000000000000 --- a/torch-ext/sage_attention/_sage_attention_57cb7ec_dirty.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:afa4831d0d218167c818a3871cf9fc01f154a6fc3c4671efdfede77a83e3b083 -size 26036368 diff --git a/torch-ext/torch_binding.cpp b/torch-ext/torch_binding.cpp index 54b6d54261acf81d499d93fad0e814887d6a337a..6bf79b2239679f5258844f7f51be55553d340c1c 100644 --- a/torch-ext/torch_binding.cpp +++ b/torch-ext/torch_binding.cpp @@ -2,260 +2,7 @@ #include "registration.h" #include "torch_binding.h" -#include -void sm_check_89(torch::Tensor x, std::string op_name) { - int device_index = x.get_device(); - const auto& prop = at::cuda::getDeviceProperties(device_index); - - std::cerr << "sm_check_89: prop->major: " << prop->major << std::endl; - std::cerr << "sm_check_89: prop->minor: " << prop->minor << std::endl; - - if (prop->major < 8 || (prop->major == 8 && prop->minor < 9)) { - TORCH_CHECK(false, op_name + " requires compute capability 8.9+"); - } -} - -void sm_check_90(torch::Tensor x, std::string op_name) { - int device_index = x.get_device(); - const auto& prop = at::cuda::getDeviceProperties(device_index); - - std::cerr << "sm_check_90: prop->major: " << prop->major << std::endl; - std::cerr << "sm_check_90: prop->minor: " << prop->minor << std::endl; - - if (prop->major < 9) { - TORCH_CHECK(false, op_name + " requires compute capability 9.0+"); - } -} - -void sm_check_80(torch::Tensor x, std::string op_name) { - int device_index = x.get_device(); - const auto& prop = at::cuda::getDeviceProperties(device_index); - std::cerr << "sm_check_80: prop->major: " << prop->major << std::endl; - std::cerr << "sm_check_80: prop->minor: " << prop->minor << std::endl; - if (prop->major < 8) { - TORCH_CHECK(false, op_name + " requires compute capability 8.0+"); - } -} - -// ############################################################################## -// SM89 -// ############################################################################## -static at::Tensor qk_int8_sv_f8_accum_f32_attn_wrap( - at::Tensor q, at::Tensor k, at::Tensor v, at::Tensor o, - at::Tensor q_scale, at::Tensor k_scale, - int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, - double sm_scale, int64_t return_lse) { - sm_check_89(q, "qk_int8_sv_f8_accum_f32_attn"); - return qk_int8_sv_f8_accum_f32_attn( - q, k, v, o, q_scale, k_scale, - static_cast(tensor_layout), static_cast(is_causal), static_cast(qk_quant_gran), - static_cast(sm_scale), static_cast(return_lse)); -} - -static at::Tensor qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_wrap( - at::Tensor q, at::Tensor k, at::Tensor v, at::Tensor o, - at::Tensor q_scale, at::Tensor k_scale, at::Tensor v_scale, - int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, - double sm_scale, int64_t return_lse) { - sm_check_89(q, "qk_int8_sv_f8_accum_f32_fuse_v_scale_attn"); - return qk_int8_sv_f8_accum_f32_fuse_v_scale_attn( - q, k, v, o, q_scale, k_scale, v_scale, - static_cast(tensor_layout), static_cast(is_causal), static_cast(qk_quant_gran), - static_cast(sm_scale), static_cast(return_lse)); -} - -static at::Tensor qk_int8_sv_f8_accum_f32_fuse_v_scale_fuse_v_mean_attn_wrap( - at::Tensor q, 
at::Tensor k, at::Tensor v, at::Tensor o, - at::Tensor q_scale, at::Tensor k_scale, at::Tensor v_scale, at::Tensor v_mean, - int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, - double sm_scale, int64_t return_lse) { - sm_check_89(q, "qk_int8_sv_f8_accum_f32_fuse_v_scale_fuse_v_mean_attn"); - return qk_int8_sv_f8_accum_f32_fuse_v_scale_fuse_v_mean_attn( - q, k, v, o, q_scale, k_scale, v_scale, v_mean, - static_cast(tensor_layout), static_cast(is_causal), static_cast(qk_quant_gran), - static_cast(sm_scale), static_cast(return_lse)); -} - -static at::Tensor qk_int8_sv_f8_accum_f32_attn_inst_buf_wrap( - at::Tensor q, at::Tensor k, at::Tensor v, at::Tensor o, - at::Tensor q_scale, at::Tensor k_scale, - int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, - double sm_scale, int64_t return_lse) { - sm_check_89(q, "qk_int8_sv_f8_accum_f32_attn_inst_buf"); - return qk_int8_sv_f8_accum_f32_attn_inst_buf( - q, k, v, o, q_scale, k_scale, - static_cast(tensor_layout), static_cast(is_causal), static_cast(qk_quant_gran), - static_cast(sm_scale), static_cast(return_lse)); -} - -static at::Tensor qk_int8_sv_f8_accum_f16_attn_inst_buf_wrap( - at::Tensor q, at::Tensor k, at::Tensor v, at::Tensor o, - at::Tensor q_scale, at::Tensor k_scale, - int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, - double sm_scale, int64_t return_lse) { - sm_check_89(q, "qk_int8_sv_f8_accum_f16_attn_inst_buf"); - return qk_int8_sv_f8_accum_f16_attn_inst_buf( - q, k, v, o, q_scale, k_scale, - static_cast(tensor_layout), static_cast(is_causal), static_cast(qk_quant_gran), - static_cast(sm_scale), static_cast(return_lse)); -} - -static at::Tensor qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf_wrap( - at::Tensor q, at::Tensor k, at::Tensor v, at::Tensor o, - at::Tensor q_scale, at::Tensor k_scale, at::Tensor v_scale, - int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, - double sm_scale, int64_t return_lse) { - sm_check_89(q, "qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf"); - return qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf( - q, k, v, o, q_scale, k_scale, v_scale, - static_cast(tensor_layout), static_cast(is_causal), static_cast(qk_quant_gran), - static_cast(sm_scale), static_cast(return_lse)); -} - -static at::Tensor qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf_wrap( - at::Tensor q, at::Tensor k, at::Tensor v, at::Tensor o, - at::Tensor q_scale, at::Tensor k_scale, at::Tensor v_scale, - int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, - double sm_scale, int64_t return_lse) { - sm_check_89(q, "qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf"); - return qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf( - q, k, v, o, q_scale, k_scale, v_scale, - static_cast(tensor_layout), static_cast(is_causal), static_cast(qk_quant_gran), - static_cast(sm_scale), static_cast(return_lse)); -} - - -// ############################################################################## -// SM90 -// ############################################################################## - -static at::Tensor qk_int8_sv_f8_accum_f32_attn_inst_buf_sm90_wrap( - at::Tensor q, at::Tensor k, at::Tensor v, at::Tensor o, - at::Tensor q_scale, at::Tensor k_scale, - int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, - double sm_scale, int64_t return_lse) { - sm_check_90(q, "qk_int8_sv_f8_accum_f32_attn_inst_buf_sm90"); -return qk_int8_sv_f8_accum_f32_attn_inst_buf_sm90( - q, k, v, o, q_scale, k_scale, - static_cast(tensor_layout), static_cast(is_causal), 
static_cast(qk_quant_gran), - static_cast(sm_scale), static_cast(return_lse)); -} - -static at::Tensor qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf_sm90_wrap( - at::Tensor q, at::Tensor k, at::Tensor v, at::Tensor o, - at::Tensor q_scale, at::Tensor k_scale, at::Tensor v_scale, - int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, - double sm_scale, int64_t return_lse) { - sm_check_90(q, "qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf_sm90"); -return qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf_sm90( - q, k, v, o, q_scale, k_scale, v_scale, - static_cast(tensor_layout), static_cast(is_causal), static_cast(qk_quant_gran), - static_cast(sm_scale), static_cast(return_lse)); -} - -// ############################################################################## -// SM80 -// ############################################################################## -static at::Tensor qk_int8_sv_f16_accum_f32_attn_wrap( - at::Tensor q, at::Tensor k, at::Tensor v, at::Tensor o, - at::Tensor q_scale, at::Tensor k_scale, - int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, - double sm_scale, int64_t return_lse) { - sm_check_80(q, "qk_int8_sv_f16_accum_f32_attn"); - return qk_int8_sv_f16_accum_f32_attn( - q, k, v, o, q_scale, k_scale, - static_cast(tensor_layout), static_cast(is_causal), static_cast(qk_quant_gran), - static_cast(sm_scale), static_cast(return_lse)); -} - -static at::Tensor qk_int8_sv_f16_accum_f16_attn_wrap( - at::Tensor q, at::Tensor k, at::Tensor v, at::Tensor o, - at::Tensor q_scale, at::Tensor k_scale, - int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, - double sm_scale, int64_t return_lse) { - sm_check_80(q, "qk_int8_sv_f16_accum_f16_attn"); - return qk_int8_sv_f16_accum_f16_attn( - q, k, v, o, q_scale, k_scale, - static_cast(tensor_layout), static_cast(is_causal), static_cast(qk_quant_gran), - static_cast(sm_scale), static_cast(return_lse)); -} - -static at::Tensor qk_int8_sv_f16_accum_f16_attn_inst_buf_wrap( - at::Tensor q, at::Tensor k, at::Tensor v, at::Tensor o, - at::Tensor q_scale, at::Tensor k_scale, - int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, - double sm_scale, int64_t return_lse) { - sm_check_80(q, "qk_int8_sv_f16_accum_f16_attn_inst_buf"); - return qk_int8_sv_f16_accum_f16_attn_inst_buf( - q, k, v, o, q_scale, k_scale, - static_cast(tensor_layout), static_cast(is_causal), static_cast(qk_quant_gran), - static_cast(sm_scale), static_cast(return_lse)); -} - -static at::Tensor qk_int8_sv_f16_accum_f16_fuse_v_mean_attn_wrap( - at::Tensor q, at::Tensor k, at::Tensor v, at::Tensor o, - at::Tensor q_scale, at::Tensor k_scale, at::Tensor v_mean, - int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, - double sm_scale, int64_t return_lse) { - sm_check_80(q, "qk_int8_sv_f16_accum_f16_fuse_v_mean_attn"); - return qk_int8_sv_f16_accum_f16_fuse_v_mean_attn( - q, k, v, o, q_scale, k_scale, v_mean, - static_cast(tensor_layout), static_cast(is_causal), static_cast(qk_quant_gran), - static_cast(sm_scale), static_cast(return_lse)); -} - -// Fused -static void quant_per_block_int8_cuda_wrap( - at::Tensor input, at::Tensor output, at::Tensor scale, - double sm_scale, int64_t block_size, int64_t tensor_layout) { - quant_per_block_int8_cuda( - input, output, scale, - static_cast(sm_scale), static_cast(block_size), static_cast(tensor_layout)); -} - -static void quant_per_block_int8_fuse_sub_mean_cuda_wrap( - at::Tensor input, at::Tensor mean, at::Tensor output, at::Tensor scale, - int64_t block_size, int64_t 
tensor_layout) { - quant_per_block_int8_fuse_sub_mean_cuda( - input, mean, output, scale, - static_cast(block_size), static_cast(tensor_layout)); -} - -static void quant_per_warp_int8_cuda_wrap( - at::Tensor input, at::Tensor output, at::Tensor scale, - int64_t block_size, int64_t warp_block_size, int64_t tensor_layout) { - quant_per_warp_int8_cuda( - input, output, scale, - static_cast(block_size), static_cast(warp_block_size), static_cast(tensor_layout)); -} - -static void sub_mean_cuda_wrap( - at::Tensor input, at::Tensor mean, at::Tensor output, - int64_t tensor_layout) { - sub_mean_cuda(input, mean, output, static_cast(tensor_layout)); -} - -static void transpose_pad_permute_cuda_wrap( - at::Tensor input, at::Tensor output, int64_t tensor_layout) { - transpose_pad_permute_cuda(input, output, static_cast(tensor_layout)); -} - -static void scale_fuse_quant_cuda_wrap( - at::Tensor input, at::Tensor output, at::Tensor scale, - int64_t num_tokens, double scale_max, int64_t tensor_layout) { - scale_fuse_quant_cuda( - input, output, scale, - static_cast(num_tokens), static_cast(scale_max), static_cast(tensor_layout)); -} - -static void mean_scale_fuse_quant_cuda_wrap( - at::Tensor input, at::Tensor output, at::Tensor mean, at::Tensor scale, - int64_t num_tokens, double scale_max, int64_t tensor_layout) { - mean_scale_fuse_quant_cuda( - input, output, mean, scale, - static_cast(num_tokens), static_cast(scale_max), static_cast(tensor_layout)); -} TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // SM90 diff --git a/torch-ext/torch_binding.h b/torch-ext/torch_binding.h index 4be1bb053aecd753a96f93f3f3f2cf95c9095c84..46bed40d0868d47b6fb896149e760b61e9ba2ec9 100644 --- a/torch-ext/torch_binding.h +++ b/torch-ext/torch_binding.h @@ -1,6 +1,7 @@ #pragma once #include +#include // SM80 torch::Tensor qk_int8_sv_f16_accum_f32_attn(torch::Tensor query, @@ -218,4 +219,261 @@ void mean_scale_fuse_quant_cuda( torch::Tensor scale, int num_tokens, float scale_max, - int tensor_layout); \ No newline at end of file + int tensor_layout); + + + +void sm_check_89(torch::Tensor x, std::string op_name) { + int device_index = x.get_device(); + const auto& prop = at::cuda::getDeviceProperties(device_index); + + std::cerr << "sm_check_89: prop->major: " << prop->major << std::endl; + std::cerr << "sm_check_89: prop->minor: " << prop->minor << std::endl; + + if (prop->major < 8 || (prop->major == 8 && prop->minor < 9)) { + TORCH_CHECK(false, op_name + " requires compute capability 8.9+"); + } +} + +void sm_check_90(torch::Tensor x, std::string op_name) { + int device_index = x.get_device(); + const auto& prop = at::cuda::getDeviceProperties(device_index); + + std::cerr << "sm_check_90: prop->major: " << prop->major << std::endl; + std::cerr << "sm_check_90: prop->minor: " << prop->minor << std::endl; + + if (prop->major < 9) { + TORCH_CHECK(false, op_name + " requires compute capability 9.0+"); + } +} + +void sm_check_80(torch::Tensor x, std::string op_name) { + int device_index = x.get_device(); + const auto& prop = at::cuda::getDeviceProperties(device_index); + std::cerr << "sm_check_80: prop->major: " << prop->major << std::endl; + std::cerr << "sm_check_80: prop->minor: " << prop->minor << std::endl; + if (prop->major < 8) { + TORCH_CHECK(false, op_name + " requires compute capability 8.0+"); + } +} + +// ############################################################################## +// SM89 +// ############################################################################## +static at::Tensor 
qk_int8_sv_f8_accum_f32_attn_wrap( + at::Tensor q, at::Tensor k, at::Tensor v, at::Tensor o, + at::Tensor q_scale, at::Tensor k_scale, + int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, + double sm_scale, int64_t return_lse) { + sm_check_89(q, "qk_int8_sv_f8_accum_f32_attn"); + return qk_int8_sv_f8_accum_f32_attn( + q, k, v, o, q_scale, k_scale, + static_cast(tensor_layout), static_cast(is_causal), static_cast(qk_quant_gran), + static_cast(sm_scale), static_cast(return_lse)); +} + +static at::Tensor qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_wrap( + at::Tensor q, at::Tensor k, at::Tensor v, at::Tensor o, + at::Tensor q_scale, at::Tensor k_scale, at::Tensor v_scale, + int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, + double sm_scale, int64_t return_lse) { + sm_check_89(q, "qk_int8_sv_f8_accum_f32_fuse_v_scale_attn"); + return qk_int8_sv_f8_accum_f32_fuse_v_scale_attn( + q, k, v, o, q_scale, k_scale, v_scale, + static_cast(tensor_layout), static_cast(is_causal), static_cast(qk_quant_gran), + static_cast(sm_scale), static_cast(return_lse)); +} + +static at::Tensor qk_int8_sv_f8_accum_f32_fuse_v_scale_fuse_v_mean_attn_wrap( + at::Tensor q, at::Tensor k, at::Tensor v, at::Tensor o, + at::Tensor q_scale, at::Tensor k_scale, at::Tensor v_scale, at::Tensor v_mean, + int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, + double sm_scale, int64_t return_lse) { + sm_check_89(q, "qk_int8_sv_f8_accum_f32_fuse_v_scale_fuse_v_mean_attn"); + return qk_int8_sv_f8_accum_f32_fuse_v_scale_fuse_v_mean_attn( + q, k, v, o, q_scale, k_scale, v_scale, v_mean, + static_cast(tensor_layout), static_cast(is_causal), static_cast(qk_quant_gran), + static_cast(sm_scale), static_cast(return_lse)); +} + +static at::Tensor qk_int8_sv_f8_accum_f32_attn_inst_buf_wrap( + at::Tensor q, at::Tensor k, at::Tensor v, at::Tensor o, + at::Tensor q_scale, at::Tensor k_scale, + int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, + double sm_scale, int64_t return_lse) { + sm_check_89(q, "qk_int8_sv_f8_accum_f32_attn_inst_buf"); + return qk_int8_sv_f8_accum_f32_attn_inst_buf( + q, k, v, o, q_scale, k_scale, + static_cast(tensor_layout), static_cast(is_causal), static_cast(qk_quant_gran), + static_cast(sm_scale), static_cast(return_lse)); +} + +static at::Tensor qk_int8_sv_f8_accum_f16_attn_inst_buf_wrap( + at::Tensor q, at::Tensor k, at::Tensor v, at::Tensor o, + at::Tensor q_scale, at::Tensor k_scale, + int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, + double sm_scale, int64_t return_lse) { + sm_check_89(q, "qk_int8_sv_f8_accum_f16_attn_inst_buf"); + return qk_int8_sv_f8_accum_f16_attn_inst_buf( + q, k, v, o, q_scale, k_scale, + static_cast(tensor_layout), static_cast(is_causal), static_cast(qk_quant_gran), + static_cast(sm_scale), static_cast(return_lse)); +} + +static at::Tensor qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf_wrap( + at::Tensor q, at::Tensor k, at::Tensor v, at::Tensor o, + at::Tensor q_scale, at::Tensor k_scale, at::Tensor v_scale, + int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, + double sm_scale, int64_t return_lse) { + sm_check_89(q, "qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf"); + return qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf( + q, k, v, o, q_scale, k_scale, v_scale, + static_cast(tensor_layout), static_cast(is_causal), static_cast(qk_quant_gran), + static_cast(sm_scale), static_cast(return_lse)); +} + +static at::Tensor qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf_wrap( + at::Tensor q, 
at::Tensor k, at::Tensor v, at::Tensor o, + at::Tensor q_scale, at::Tensor k_scale, at::Tensor v_scale, + int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, + double sm_scale, int64_t return_lse) { + sm_check_89(q, "qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf"); + return qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf( + q, k, v, o, q_scale, k_scale, v_scale, + static_cast(tensor_layout), static_cast(is_causal), static_cast(qk_quant_gran), + static_cast(sm_scale), static_cast(return_lse)); +} + + +// ############################################################################## +// SM90 +// ############################################################################## + +static at::Tensor qk_int8_sv_f8_accum_f32_attn_inst_buf_sm90_wrap( + at::Tensor q, at::Tensor k, at::Tensor v, at::Tensor o, + at::Tensor q_scale, at::Tensor k_scale, + int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, + double sm_scale, int64_t return_lse) { + sm_check_90(q, "qk_int8_sv_f8_accum_f32_attn_inst_buf_sm90"); +return qk_int8_sv_f8_accum_f32_attn_inst_buf_sm90( + q, k, v, o, q_scale, k_scale, + static_cast(tensor_layout), static_cast(is_causal), static_cast(qk_quant_gran), + static_cast(sm_scale), static_cast(return_lse)); +} + +static at::Tensor qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf_sm90_wrap( + at::Tensor q, at::Tensor k, at::Tensor v, at::Tensor o, + at::Tensor q_scale, at::Tensor k_scale, at::Tensor v_scale, + int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, + double sm_scale, int64_t return_lse) { + sm_check_90(q, "qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf_sm90"); +return qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf_sm90( + q, k, v, o, q_scale, k_scale, v_scale, + static_cast(tensor_layout), static_cast(is_causal), static_cast(qk_quant_gran), + static_cast(sm_scale), static_cast(return_lse)); +} + +// ############################################################################## +// SM80 +// ############################################################################## +static at::Tensor qk_int8_sv_f16_accum_f32_attn_wrap( + at::Tensor q, at::Tensor k, at::Tensor v, at::Tensor o, + at::Tensor q_scale, at::Tensor k_scale, + int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, + double sm_scale, int64_t return_lse) { + sm_check_80(q, "qk_int8_sv_f16_accum_f32_attn"); + return qk_int8_sv_f16_accum_f32_attn( + q, k, v, o, q_scale, k_scale, + static_cast(tensor_layout), static_cast(is_causal), static_cast(qk_quant_gran), + static_cast(sm_scale), static_cast(return_lse)); +} + +static at::Tensor qk_int8_sv_f16_accum_f16_attn_wrap( + at::Tensor q, at::Tensor k, at::Tensor v, at::Tensor o, + at::Tensor q_scale, at::Tensor k_scale, + int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, + double sm_scale, int64_t return_lse) { + sm_check_80(q, "qk_int8_sv_f16_accum_f16_attn"); + return qk_int8_sv_f16_accum_f16_attn( + q, k, v, o, q_scale, k_scale, + static_cast(tensor_layout), static_cast(is_causal), static_cast(qk_quant_gran), + static_cast(sm_scale), static_cast(return_lse)); +} + +static at::Tensor qk_int8_sv_f16_accum_f16_attn_inst_buf_wrap( + at::Tensor q, at::Tensor k, at::Tensor v, at::Tensor o, + at::Tensor q_scale, at::Tensor k_scale, + int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, + double sm_scale, int64_t return_lse) { + sm_check_80(q, "qk_int8_sv_f16_accum_f16_attn_inst_buf"); + return qk_int8_sv_f16_accum_f16_attn_inst_buf( + q, k, v, o, q_scale, k_scale, + 
static_cast(tensor_layout), static_cast(is_causal), static_cast(qk_quant_gran), + static_cast(sm_scale), static_cast(return_lse)); +} + +static at::Tensor qk_int8_sv_f16_accum_f16_fuse_v_mean_attn_wrap( + at::Tensor q, at::Tensor k, at::Tensor v, at::Tensor o, + at::Tensor q_scale, at::Tensor k_scale, at::Tensor v_mean, + int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, + double sm_scale, int64_t return_lse) { + sm_check_80(q, "qk_int8_sv_f16_accum_f16_fuse_v_mean_attn"); + return qk_int8_sv_f16_accum_f16_fuse_v_mean_attn( + q, k, v, o, q_scale, k_scale, v_mean, + static_cast(tensor_layout), static_cast(is_causal), static_cast(qk_quant_gran), + static_cast(sm_scale), static_cast(return_lse)); +} + +// ############################################################################## +// Fused +// ############################################################################## +static void quant_per_block_int8_cuda_wrap( + at::Tensor input, at::Tensor output, at::Tensor scale, + double sm_scale, int64_t block_size, int64_t tensor_layout) { + quant_per_block_int8_cuda( + input, output, scale, + static_cast(sm_scale), static_cast(block_size), static_cast(tensor_layout)); +} + +static void quant_per_block_int8_fuse_sub_mean_cuda_wrap( + at::Tensor input, at::Tensor mean, at::Tensor output, at::Tensor scale, + int64_t block_size, int64_t tensor_layout) { + quant_per_block_int8_fuse_sub_mean_cuda( + input, mean, output, scale, + static_cast(block_size), static_cast(tensor_layout)); +} + +static void quant_per_warp_int8_cuda_wrap( + at::Tensor input, at::Tensor output, at::Tensor scale, + int64_t block_size, int64_t warp_block_size, int64_t tensor_layout) { + quant_per_warp_int8_cuda( + input, output, scale, + static_cast(block_size), static_cast(warp_block_size), static_cast(tensor_layout)); +} + +static void sub_mean_cuda_wrap( + at::Tensor input, at::Tensor mean, at::Tensor output, + int64_t tensor_layout) { + sub_mean_cuda(input, mean, output, static_cast(tensor_layout)); +} + +static void transpose_pad_permute_cuda_wrap( + at::Tensor input, at::Tensor output, int64_t tensor_layout) { + transpose_pad_permute_cuda(input, output, static_cast(tensor_layout)); +} + +static void scale_fuse_quant_cuda_wrap( + at::Tensor input, at::Tensor output, at::Tensor scale, + int64_t num_tokens, double scale_max, int64_t tensor_layout) { + scale_fuse_quant_cuda( + input, output, scale, + static_cast(num_tokens), static_cast(scale_max), static_cast(tensor_layout)); +} + +static void mean_scale_fuse_quant_cuda_wrap( + at::Tensor input, at::Tensor output, at::Tensor mean, at::Tensor scale, + int64_t num_tokens, double scale_max, int64_t tensor_layout) { + mean_scale_fuse_quant_cuda( + input, output, mean, scale, + static_cast(num_tokens), static_cast(scale_max), static_cast(tensor_layout)); +} \ No newline at end of file
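
Two points in the cuda_tensormap_shim.cuh hunk above are worth making concrete. First, the "Layout-compatible stand-in" only works because CUtensorMap is an opaque, 64-byte-aligned blob of CU_TENSOR_MAP_NUM_QWORDS (16) qwords that the driver reads through a pointer, so the stand-in's size, alignment, and enum values all have to match cuda.h exactly. A compile-time check along the following lines pins that down; it is an illustrative sketch, not code from this diff.

// Illustrative check, not part of this change: whichever definition is in
// effect (the real CUtensorMap from cuda.h or the shim's stand-in), it must
// be 128 bytes (16 qwords) with 64-byte alignment, because the driver reads
// the tensor map through this pointer type.
#include <cuda.h>                                   // real tensor-map types on newer toolkits
#include "sage_attention/cuda_tensormap_shim.cuh"   // stand-ins on older ones

static_assert(sizeof(CUtensorMap) == 16 * sizeof(unsigned long long),
              "CUtensorMap must be exactly 128 bytes");
static_assert(alignof(CUtensorMap) == 64,
              "CUtensorMap must be 64-byte aligned");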
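
Second, the shim ends with "no declaration of cuTensorMapEncodeTiled here; it's resolved at runtime", but the resolution code itself is not part of this diff. One common way to do that on Linux is dlopen/dlsym against libcuda.so.1, sketched below; the helper name resolve_cuTensorMapEncodeTiled and the error handling are assumptions, and the function-pointer signature follows the CUDA driver API documentation for cuTensorMapEncodeTiled.

// Illustrative sketch, not repository code: resolve cuTensorMapEncodeTiled
// from the driver at runtime, as the shim's closing comment describes.
#include <cuda.h>                                   // CUresult, cuuint32_t, cuuint64_t
#include "sage_attention/cuda_tensormap_shim.cuh"   // tensor-map types if cuda.h lacks them
#include <dlfcn.h>
#include <cstdio>

// Signature as documented for the CUDA driver API entry point.
using cuTensorMapEncodeTiled_t = CUresult (*)(
    CUtensorMap* tensorMap,
    CUtensorMapDataType dataType,
    cuuint32_t rank,
    void* globalAddress,
    const cuuint64_t* globalDim,      // rank entries
    const cuuint64_t* globalStrides,  // rank - 1 entries, in bytes
    const cuuint32_t* boxDim,         // rank entries
    const cuuint32_t* elementStrides, // rank entries
    CUtensorMapInterleave interleave,
    CUtensorMapSwizzle swizzle,
    CUtensorMapL2promotion l2Promotion,
    CUtensorMapFloatOOBfill oobFill);

// Hypothetical helper: libcuda is normally already loaded by the CUDA runtime,
// so dlopen only bumps its reference count and returns the existing handle.
inline cuTensorMapEncodeTiled_t resolve_cuTensorMapEncodeTiled() {
  static cuTensorMapEncodeTiled_t fn = [] {
    void* sym = nullptr;
    if (void* libcuda = dlopen("libcuda.so.1", RTLD_NOW | RTLD_GLOBAL)) {
      sym = dlsym(libcuda, "cuTensorMapEncodeTiled");
    }
    if (sym == nullptr) {
      std::fprintf(stderr, "cuTensorMapEncodeTiled is not exported by this driver\n");
    }
    return reinterpret_cast<cuTensorMapEncodeTiled_t>(sym);
  }();
  return fn;
}

Resolving the symbol this way keeps the SM90 path buildable against toolchains whose headers predate the tensor-map API while still using the driver's implementation when it is present; callers only need to check the returned pointer for nullptr before encoding a tensor map.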