diff --git a/CMakeLists.txt b/CMakeLists.txt index f9d94d9e27b0dea56a4cb9809201bcf3f72f4f78..06784b4e0515d9cdcbaeae31660a0b1faf682703 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -142,6 +142,7 @@ set(_qattn_sm90_SRC "sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu" "sage_attention/qattn/attn_cuda_sm90.h" "sage_attention/qattn/attn_utils.cuh" +"sage_attention/cuda_tensormap_shim.cuh" ) # TODO: check if CLion support this: diff --git a/build.toml b/build.toml index 60e0eefd1beefca4cdcb0e017a7747d6b2b9e0b2..5859c1b4ec49cb40088f939bb0d26b1b77f462de 100644 --- a/build.toml +++ b/build.toml @@ -1,21 +1,20 @@ [general] name = "sage_attention" universal = false +cuda-minver = "12.4" [torch] src = [ "torch-ext/torch_binding.cpp", "torch-ext/torch_binding.h", ] -cuda-capabilities = [ - "8.0", "9.0" -] [kernel._qattn] depends = ["torch"] backend = "cuda" +cuda-minver = "12.4" cuda-capabilities = [ - "9.0" + "8.0", "8.9", "9.0a" ] src = [ "sage_attention/cp_async.cuh", @@ -27,6 +26,7 @@ src = [ "sage_attention/reduction_utils.cuh", "sage_attention/wgmma.cuh", "sage_attention/utils.cuh", + "sage_attention/cuda_tensormap_shim.cuh", ] cxx-flags = ["-g", "-O3", "-fopenmp", "-lgomp", "-std=c++17", "-DENABLE_BF16"] cuda-flags = [ @@ -43,6 +43,7 @@ cuda-flags = [ [kernel._qattn_sm80] depends = ["torch"] backend = "cuda" +cuda-minver = "12.4" cuda-capabilities = [ "8.0" ] @@ -68,6 +69,7 @@ cuda-flags = [ [kernel._qattn_sm89] depends = ["torch"] backend = "cuda" +cuda-minver = "12.4" cuda-capabilities = [ "8.9", ] @@ -100,8 +102,9 @@ cuda-flags = [ [kernel._qattn_sm90] depends = ["torch"] backend = "cuda" +cuda-minver = "12.4" cuda-capabilities = [ - "9.0", + "9.0a", ] include = ["."] src = [ @@ -124,8 +127,9 @@ cuda-flags = [ [kernel._fused] depends = ["torch"] backend = "cuda" +cuda-minver = "12.4" cuda-capabilities = [ - "9.0", + "8.0", "8.9", "9.0a", ] include = ["."] src = [ diff --git a/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__init__.py b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5df69e92664edfbd0820d5126792e41e23a72762 --- /dev/null +++ b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__init__.py @@ -0,0 +1,12 @@ +from .quant import per_block_int8, per_warp_int8, sub_mean, per_channel_fp8 +from .core import sageattn, sageattn_qk_int8_pv_fp8_cuda + + +__all__ = [ + "per_block_int8", + "per_warp_int8", + "sub_mean", + "per_channel_fp8", + "sageattn", + "sageattn_qk_int8_pv_fp8_cuda", +] \ No newline at end of file diff --git a/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2dd95ad76c739a10e50fa3582787c6f2b2b7719d Binary files /dev/null and b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9588c3ce2f047661a943ab8cef11327e921545cb Binary files /dev/null and b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc 
b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..104e4122e8e0b2ab21b6ddc95c5b3f432d3a7736 Binary files /dev/null and b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..94f8d4797d4233c6a590b019bfe1950aca586f5c Binary files /dev/null and b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8047ade599c0570f9a869e5f1e4406f8ec35c444 Binary files /dev/null and b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/_ops.py b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..7dc56aab172e24e052a9f0c78060ecfbdff00309 --- /dev/null +++ b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _sage_attention_44b112f_dirty +ops = torch.ops._sage_attention_44b112f_dirty + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_sage_attention_44b112f_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/_sage_attention_44b112f_dirty.abi3.so b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/_sage_attention_44b112f_dirty.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..8d67979089a7c21cfb93c3a6232245d4ed307168 --- /dev/null +++ b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/_sage_attention_44b112f_dirty.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b577da1986b76b2571e8dd55412621e6fc85fe1a2f847bc0a5af9851bf388cf2 +size 26037568 diff --git a/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/core.py b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/core.py new file mode 100644 index 0000000000000000000000000000000000000000..dc44a8e1ee17a5c5c65da5adda6faf9228cca55e --- /dev/null +++ b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/core.py @@ -0,0 +1,983 @@ +""" +Copyright (c) 2024 by SageAttention team. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import torch +import torch.nn.functional as F + +from ._ops import ops + + +from .quant import per_warp_int8 as per_warp_int8_cuda +from .quant import sub_mean +from .quant import per_channel_fp8 +from .quant_per_thread import per_thread_int8 as per_thread_int8_triton + +from typing import Any, List, Literal, Optional, Tuple, Union +import warnings + + +import subprocess +import re + + +def get_cuda_version(): + try: + output = subprocess.check_output(["nvcc", "--version"]).decode() + match = re.search(r"release (\d+)\.(\d+)", output) + if match: + major, minor = int(match.group(1)), int(match.group(2)) + return major, minor + except Exception as e: + print("Failed to get CUDA version:", e) + return None, None + + +def get_cuda_arch_versions(): + cuda_archs = [] + for i in range(torch.cuda.device_count()): + major, minor = torch.cuda.get_device_capability(i) + cuda_archs.append(f"sm{major}{minor}") + return cuda_archs + + +def sageattn( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + tensor_layout: str = "HND", + is_causal: bool = False, + sm_scale: Optional[float] = None, + return_lse: bool = False, + **kwargs: Any, +): + """ + Automatically selects the appropriate implementation of the SageAttention kernel based on the GPU compute capability. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + v : torch.Tensor + The value tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + is_causal : bool + Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len. + Default: False. + + sm_scale : Optional[float] + The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``. + + return_lse : bool + Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention. + Default: False. + + Returns + ------- + torch.Tensor + The output tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + torch.Tensor + The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor). + Shape: ``[batch_size, num_qo_heads, qo_len]``. + Only returned if `return_lse` is True. + + Note + ---- + - ``num_qo_heads`` must be divisible by ``num_kv_heads``. + - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - All tensors must be on the same cuda device. 
+ """ + + arch = get_cuda_arch_versions()[q.device.index] + if arch == "sm80": + return sageattn_qk_int8_pv_fp16_cuda( + q, + k, + v, + tensor_layout=tensor_layout, + is_causal=is_causal, + sm_scale=sm_scale, + return_lse=return_lse, + pv_accum_dtype="fp32", + ) + elif arch == "sm89": + return sageattn_qk_int8_pv_fp8_cuda( + q, + k, + v, + tensor_layout=tensor_layout, + is_causal=is_causal, + sm_scale=sm_scale, + return_lse=return_lse, + pv_accum_dtype="fp32+fp16", + ) + elif arch == "sm90": + return sageattn_qk_int8_pv_fp8_cuda_sm90( + q, + k, + v, + tensor_layout=tensor_layout, + is_causal=is_causal, + sm_scale=sm_scale, + return_lse=return_lse, + pv_accum_dtype="fp32+fp32", + ) + elif arch == "sm120": + return sageattn_qk_int8_pv_fp8_cuda( + q, + k, + v, + tensor_layout=tensor_layout, + is_causal=is_causal, + qk_quant_gran="per_warp", + sm_scale=sm_scale, + return_lse=return_lse, + pv_accum_dtype="fp32+fp16", + ) # sm120 has accurate fp32 accumulator for fp8 mma and triton kernel is currently not usable on sm120. + else: + raise ValueError(f"Unsupported CUDA architecture: {arch}") + + +@torch.compiler.disable +def sageattn_qk_int8_pv_fp16_cuda( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + tensor_layout: str = "HND", + is_causal: bool = False, + qk_quant_gran: str = "per_thread", + sm_scale: Optional[float] = None, + pv_accum_dtype: str = "fp32", + smooth_k: bool = True, + smooth_v: bool = False, + return_lse: bool = False, + **kwargs: Any, +) -> torch.Tensor: + """ + SageAttention with INT8 quantization for Q and K, FP16 PV with FP16/FP32 accumulation, implemented using CUDA. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + v : torch.Tensor + The value tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + is_causal : bool + Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len. + Default: False. + + qk_quant_gran : str + The granularity of quantization for Q and K, either "per_warp" or "per_thread". + Default: "per_thread". + + sm_scale : Optional[float] + The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``. + + pv_accum_dtype : str + The dtype of the accumulation of the product of the value tensor and the attention weights, either "fp16", "fp16+fp32" or "fp32". + - "fp16": PV accumulation is done in fully in FP16. This is the fastest option but may lead to numerical instability. `smooth_v` option will increase the accuracy in cases when the value tensor has a large bias (like in CogVideoX-2b). + - "fp32": PV accumulation is done in FP32. This is the most accurate option but may be slower than "fp16" due to CUDA core overhead. + - "fp16+fp32": PV accumulation is done in FP16, but added to a FP32 buffer every few iterations. This offers a balance between speed and accuracy. + Default: "fp32". 
+ + smooth_k : bool + Whether to smooth the key tensor by subtracting the mean along the sequence dimension. + Default: True. + + smooth_v : bool + Whether to smooth the value tensor by subtracting the mean along the sequence dimension. + smooth_v will be ignored if pv_accum_dtype is "fp32" or "fp16+fp32". + Default: False. + + return_lse : bool + Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention. + Default: False. + + Returns + ------- + torch.Tensor + The output tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + torch.Tensor + The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor). + Shape: ``[batch_size, num_qo_heads, qo_len]``. + Only returned if `return_lse` is True. + + Note + ---- + - ``num_qo_heads`` must be divisible by ``num_kv_heads``. + - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - All tensors must be on the same cuda device. + - `smooth_k` will introduce slight overhead but will improve the accuracy under most circumstances. + """ + + dtype = q.dtype + assert q.is_cuda, "Input tensors must be on cuda." + assert dtype in [torch.float16, torch.bfloat16], ( + "Input tensors must be in dtype of torch.float16 or torch.bfloat16" + ) + assert qk_quant_gran in ["per_warp", "per_thread"], ( + "qk_quant_gran must be either 'per_warp' or 'per_thread'." + ) + assert q.device == k.device == v.device, "All tensors must be on the same device." + assert q.dtype == k.dtype == v.dtype, "All tensors must have the same dtype." + + # FIXME(DefTruth): make sage attention work compatible with distributed + # env, for example, xDiT which launch by torchrun. Without this workaround, + # sage attention will run into illegal memory access error after first + # inference step in distributed env for multi gpus inference. This small + # workaround also make sage attention work compatible with torch.compile + # through non-fullgraph compile mode. + torch.cuda.set_device(v.device) + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + _is_caual = 1 if is_causal else 0 + _qk_quant_gran = 3 if qk_quant_gran == "per_thread" else 2 + _return_lse = 1 if return_lse else 0 + + head_dim_og = q.size(-1) + + if head_dim_og < 64: + q = torch.nn.functional.pad(q, (0, 64 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 64 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 64 - head_dim_og)) + elif head_dim_og > 64 and head_dim_og < 128: + q = torch.nn.functional.pad(q, (0, 128 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 128 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 128 - head_dim_og)) + elif head_dim_og > 128: + raise ValueError(f"Unsupported head_dim: {head_dim_og}") + + # assert last dim is contiguous + assert q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1, ( + "Last dim of qkv must be contiguous." 
+ ) + + if sm_scale is None: + sm_scale = head_dim_og**-0.5 + + seq_dim = 1 if _tensor_layout == 0 else 2 + nh_dim = 2 if _tensor_layout == 0 else 1 + + if smooth_k: + km = k.mean(dim=seq_dim, keepdim=True) + nqheads = q.size(2) + nkheads = k.size(2) + q_per_kv_heads = nqheads // nkheads + if q_per_kv_heads > 1: + # nheads_k => nheads_q + km_broadcast = torch.repeat_interleave(km, q_per_kv_heads, dim=nh_dim) + else: + km_broadcast = km + if return_lse: + if tensor_layout == "NHD": + lse_correction = ( + torch.matmul( + q.transpose(1, 2), km_broadcast.transpose(1, 2).transpose(2, 3) + ) + .squeeze(-1) + .to(torch.float32) + ) + else: + lse_correction = ( + torch.matmul(q, km_broadcast.transpose(2, 3)) + .squeeze(-1) + .to(torch.float32) + ) + else: + km = None + + if qk_quant_gran == "per_warp": + q_int8, q_scale, k_int8, k_scale = per_warp_int8_cuda( + q, + k, + km, + tensor_layout=tensor_layout, + BLKQ=128, + WARPQ=(16 if (q.size(-1) == 128 and pv_accum_dtype == "fp16+fp32") else 32), + BLKK=64, + ) + elif qk_quant_gran == "per_thread": + q_int8, q_scale, k_int8, k_scale = per_thread_int8_triton( + q, + k, + km, + tensor_layout=tensor_layout, + BLKQ=128, + WARPQ=(16 if (q.size(-1) == 128 and pv_accum_dtype == "fp16+fp32") else 32), + BLKK=64, + WARPK=64, + ) + + o = torch.empty(q.size(), dtype=dtype, device=q.device) + + if pv_accum_dtype in ["fp32", "fp16+fp32"] and smooth_v: + warnings.warn(f"pv_accum_dtype is {pv_accum_dtype}, smooth_v will be ignored.") + smooth_v = False + + if pv_accum_dtype == "fp32": + v = v.to(torch.float16) + lse = _qattn_sm80.qk_int8_sv_f16_accum_f32_attn( + q_int8, + k_int8, + v, + o, + q_scale, + k_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + elif pv_accum_dtype == "fp16": + if smooth_v: + smoothed_v, vm = sub_mean(v, tensor_layout=tensor_layout) + lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_fuse_v_mean_attn( + q_int8, + k_int8, + smoothed_v, + o, + q_scale, + k_scale, + vm, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + else: + v = v.to(torch.float16) + lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_attn( + q_int8, + k_int8, + v, + o, + q_scale, + k_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + elif pv_accum_dtype == "fp16+fp32": + v = v.to(torch.float16) + lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_attn_inst_buf( + q_int8, + k_int8, + v, + o, + q_scale, + k_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + else: + raise ValueError(f"Unsupported pv_accum_dtype: {pv_accum_dtype}") + + o = o[..., :head_dim_og] + + if return_lse: + return ( + o, + lse / 1.44269504 + lse_correction * sm_scale + if smooth_k + else lse / 1.44269504, + ) + else: + return o + + +@torch.compiler.disable +def sageattn_qk_int8_pv_fp8_cuda( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + tensor_layout: str = "HND", + is_causal: bool = False, + qk_quant_gran: str = "per_thread", + sm_scale: Optional[float] = None, + pv_accum_dtype: str = "fp32+fp16", + smooth_k: bool = True, + smooth_v: bool = False, + return_lse: bool = False, + **kwargs: Any, +) -> torch.Tensor: + """ + SageAttention with INT8 quantization for Q and K, FP8 PV with FP32 accumulation, implemented using CUDA. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. 
+ + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + v : torch.Tensor + The value tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + is_causal : bool + Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len. + Default: False. + + qk_quant_gran : str + The granularity of quantization for Q and K, either "per_warp" or "per_thread". + Default: "per_thread". + + sm_scale : Optional[float] + The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``. + + pv_accum_dtype : str + The dtype of the accumulation of the product of the value tensor and the attention weights, either "fp32" or "fp32+fp32". + - "fp32": PV accumulation is done in fully in FP32. However, due to the hardware issue, there are only 22 valid bits in the FP32 accumulator. + - "fp32+fp32": PV accumulation is done in FP32 (actually FP22), but added to a FP32 buffer every few iterations. This offers a balance between speed and accuracy. + Default: "fp32+fp32". + + smooth_k : bool + Whether to smooth the key tensor by subtracting the mean along the sequence dimension. + Default: True. + + smooth_v : bool + Whether to smooth the value tensor by subtracting the mean along the sequence dimension. + smooth_v will be ignored if pv_accum_dtype is "fp32+fp32". + Default: False. + + return_lse : bool + Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention. + Default: False. + + Returns + ------- + torch.Tensor + The output tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + torch.Tensor + The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor). + Shape: ``[batch_size, num_qo_heads, qo_len]``. + Only returned if `return_lse` is True. + + Note + ---- + - ``num_qo_heads`` must be divisible by ``num_kv_heads``. + - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - All tensors must be on the same cuda device. + - `smooth_k` will introduce slight overhead but will improve the accuracy under most circumstances. + """ + + dtype = q.dtype + assert q.is_cuda, "Input tensors must be on cuda." + assert dtype in [torch.float16, torch.bfloat16], ( + "Input tensors must be in dtype of torch.float16 or torch.bfloat16" + ) + assert qk_quant_gran in ["per_warp", "per_thread"], ( + "qk_quant_gran must be either 'per_warp' or 'per_thread'." + ) + assert q.device == k.device == v.device, "All tensors must be on the same device." + assert q.dtype == k.dtype == v.dtype, "All tensors must have the same dtype." + + # cuda_major_version, cuda_minor_version = get_cuda_version() + # if(cuda_major_version, cuda_minor_version) < (12, 8) and pv_accum_dtype == 'fp32+fp16': + # warnings.warn("cuda version < 12.8, change pv_accum_dtype to 'fp32+fp32'") + # pv_accum_dtype = 'fp32+fp32' + + # FIXME(DefTruth): make sage attention work compatible with distributed + # env, for example, xDiT which launch by torchrun. 
Without this workaround, + # sage attention will run into illegal memory access error after first + # inference step in distributed env for multi gpus inference. This small + # workaround also make sage attention work compatible with torch.compile + # through non-fullgraph compile mode. + torch.cuda.set_device(v.device) + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + _is_caual = 1 if is_causal else 0 + _qk_quant_gran = 3 if qk_quant_gran == "per_thread" else 2 + _return_lse = 1 if return_lse else 0 + + head_dim_og = q.size(-1) + + if head_dim_og < 64: + q = torch.nn.functional.pad(q, (0, 64 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 64 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 64 - head_dim_og)) + elif head_dim_og > 64 and head_dim_og < 128: + q = torch.nn.functional.pad(q, (0, 128 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 128 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 128 - head_dim_og)) + elif head_dim_og > 128: + raise ValueError(f"Unsupported head_dim: {head_dim_og}") + + # assert last dim is contiguous + assert q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1, ( + "Last dim of qkv must be contiguous." + ) + + if sm_scale is None: + sm_scale = head_dim_og**-0.5 + + seq_dim = 1 if _tensor_layout == 0 else 2 + nh_dim = 2 if _tensor_layout == 0 else 1 + + if smooth_k: + km = k.mean(dim=seq_dim, keepdim=True) + nqheads = q.size(2) + nkheads = k.size(2) + q_per_kv_heads = nqheads // nkheads + if q_per_kv_heads > 1: + # nheads_k => nheads_q + km_broadcast = torch.repeat_interleave(km, q_per_kv_heads, dim=nh_dim) + else: + km_broadcast = km + if return_lse: + if tensor_layout == "NHD": + lse_correction = ( + torch.matmul( + q.transpose(1, 2), km_broadcast.transpose(1, 2).transpose(2, 3) + ) + .squeeze(-1) + .to(torch.float32) + ) + else: + lse_correction = ( + torch.matmul(q, km_broadcast.transpose(2, 3)) + .squeeze(-1) + .to(torch.float32) + ) + else: + km = None + + if qk_quant_gran == "per_warp": + q_int8, q_scale, k_int8, k_scale = per_warp_int8_cuda( + q, k, km, tensor_layout=tensor_layout, BLKQ=128, WARPQ=32, BLKK=64 + ) + elif qk_quant_gran == "per_thread": + q_int8, q_scale, k_int8, k_scale = per_thread_int8_triton( + q, k, km, tensor_layout=tensor_layout, BLKQ=128, WARPQ=32, BLKK=64, WARPK=64 + ) + + o = torch.empty(q.size(), dtype=dtype, device=q.device) + + if pv_accum_dtype == "fp32+fp32" and smooth_v: + warnings.warn("pv_accum_dtype is 'fp32+fp32', smooth_v will be ignored.") + smooth_v = False + + if pv_accum_dtype == "fp32+fp16" and smooth_v: + warnings.warn("pv_accum_dtype is 'fp32+fp16', smooth_v will be ignored.") + smooth_v = False + + quant_v_scale_max = 448.0 + if pv_accum_dtype == "fp32+fp16": + quant_v_scale_max = 2.25 + + v_fp8, v_scale, vm = per_channel_fp8( + v, tensor_layout=tensor_layout, scale_max=quant_v_scale_max, smooth_v=smooth_v + ) + print("before kernel call") + if pv_accum_dtype == "fp32": + if smooth_v: + lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_fuse_v_mean_attn( + q_int8, + k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + vm, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + torch.cuda.synchronize() + else: + lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn( + q_int8, + k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + torch.cuda.synchronize() + elif pv_accum_dtype == "fp32+fp32": + lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf( + q_int8, + 
k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + torch.cuda.synchronize() + elif pv_accum_dtype == "fp32+fp16": + lse = ops.qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf( + q_int8, + k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + torch.cuda.synchronize() + o = o[..., :head_dim_og] + print("after kernel call") + if return_lse: + return ( + o, + lse / 1.44269504 + lse_correction * sm_scale + if smooth_k + else lse / 1.44269504, + ) + else: + return o + + +@torch.compiler.disable +def sageattn_qk_int8_pv_fp8_cuda_sm90( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + tensor_layout: str = "HND", + is_causal: bool = False, + qk_quant_gran: str = "per_thread", + sm_scale: Optional[float] = None, + pv_accum_dtype: str = "fp32+fp32", + smooth_k: bool = True, + return_lse: bool = False, + **kwargs: Any, +) -> torch.Tensor: + """ + SageAttention with INT8 quantization for Q and K, FP8 PV with FP32 accumulation, implemented using CUDA. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + v : torch.Tensor + The value tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + is_causal : bool + Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len. + Default: False. + + qk_quant_gran : str + The granularity of quantization for Q and K, either "per_warp" or "per_thread". + Default: "per_thread". + + sm_scale : Optional[float] + The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``. + + pv_accum_dtype : str + The dtype of the accumulation of the product of the value tensor and the attention weights, either "fp32" or "fp32+fp32". + - "fp32": PV accumulation is done in fully in FP32. However, due to the hardware issue, there are only 22 valid bits in the FP32 accumulator. + - "fp32+fp32": PV accumulation is done in FP32 (actually FP22), but added to a FP32 buffer every few iterations. This offers a balance between speed and accuracy. + Default: "fp32+fp32". + + smooth_k : bool + Whether to smooth the key tensor by subtracting the mean along the sequence dimension. + Default: True. + + return_lse : bool + Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention. + Default: False. + + Returns + ------- + torch.Tensor + The output tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + torch.Tensor + The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor). + Shape: ``[batch_size, num_qo_heads, qo_len]``. + Only returned if `return_lse` is True. + + Note + ---- + - ``num_qo_heads`` must be divisible by ``num_kv_heads``. 
+ - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - All tensors must be on the same cuda device. + - `smooth_k` will introduce slight overhead but will improve the accuracy under most circumstances. + """ + + dtype = q.dtype + assert q.is_cuda, "Input tensors must be on cuda." + assert dtype in [torch.float16, torch.bfloat16], ( + "Input tensors must be in dtype of torch.float16 or torch.bfloat16" + ) + assert qk_quant_gran in ["per_warp", "per_thread"], ( + "qk_quant_gran must be either 'per_warp' or 'per_thread'." + ) + assert q.device == k.device == v.device, "All tensors must be on the same device." + assert q.dtype == k.dtype == v.dtype, "All tensors must have the same dtype." + + torch.cuda.set_device(v.device) + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + _is_caual = 1 if is_causal else 0 + _qk_quant_gran = 3 if qk_quant_gran == "per_thread" else 2 + _return_lse = 1 if return_lse else 0 + + head_dim_og = q.size(-1) + + if head_dim_og < 64: + q = torch.nn.functional.pad(q, (0, 64 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 64 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 64 - head_dim_og)) + elif head_dim_og > 64 and head_dim_og < 128: + q = torch.nn.functional.pad(q, (0, 128 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 128 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 128 - head_dim_og)) + elif head_dim_og > 128: + raise ValueError(f"Unsupported head_dim: {head_dim_og}") + + # assert last dim is contiguous + assert q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1, ( + "Last dim of qkv must be contiguous." + ) + + if sm_scale is None: + sm_scale = head_dim_og**-0.5 + + seq_dim = 1 if _tensor_layout == 0 else 2 + nh_dim = 2 if _tensor_layout == 0 else 1 + + if smooth_k: + km = k.mean(dim=seq_dim, keepdim=True) + nqheads = q.size(2) + nkheads = k.size(2) + q_per_kv_heads = nqheads // nkheads + if q_per_kv_heads > 1: + # nheads_k => nheads_q + km_broadcast = torch.repeat_interleave(km, q_per_kv_heads, dim=nh_dim) + else: + km_broadcast = km + if return_lse: + if tensor_layout == "NHD": + lse_correction = ( + torch.matmul( + q.transpose(1, 2), km_broadcast.transpose(1, 2).transpose(2, 3) + ) + .squeeze(-1) + .to(torch.float32) + ) + else: + lse_correction = ( + torch.matmul(q, km_broadcast.transpose(2, 3)) + .squeeze(-1) + .to(torch.float32) + ) + else: + km = None + + if qk_quant_gran == "per_warp": + q_int8, q_scale, k_int8, k_scale = per_warp_int8_cuda( + q, k, km, tensor_layout=tensor_layout, BLKQ=64, WARPQ=16, BLKK=128 + ) + elif qk_quant_gran == "per_thread": + q_int8, q_scale, k_int8, k_scale = per_thread_int8_triton( + q, + k, + km, + tensor_layout=tensor_layout, + BLKQ=64, + WARPQ=16, + BLKK=128, + WARPK=128, + ) + + o = torch.empty(q.size(), dtype=dtype, device=q.device) + + # pad v to multiple of 128 + # TODO: modify per_channel_fp8 kernel to handle this + kv_len = k.size(seq_dim) + v_pad_len = 128 - (kv_len % 128) if kv_len % 128 != 0 else 0 + if v_pad_len > 0: + if tensor_layout == "HND": + v = torch.cat( + [ + v, + torch.zeros( + v.size(0), + v.size(1), + v_pad_len, + v.size(3), + dtype=v.dtype, + device=v.device, + ), + ], + dim=2, + ) + else: + v = torch.cat( + [ + v, + torch.zeros( + v.size(0), + v_pad_len, + v.size(2), + v.size(3), + dtype=v.dtype, + device=v.device, + ), + ], + dim=1, + ) + + v_fp8, v_scale, _ = per_channel_fp8(v, tensor_layout=tensor_layout, smooth_v=False) + + if pv_accum_dtype == "fp32": + raise NotImplementedError("Please use 
pv_accum_dtype='fp32+fp32' for sm90.") + lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn( + q_int8, + k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + elif pv_accum_dtype == "fp32+fp32": + print( + "qint8", + q_int8.shape, + "qscale", + q_scale.shape, + "kint8", + k_int8.shape, + "kscale", + k_scale.shape, + "vfp8", + v_fp8.shape, + "vscale", + v_scale.shape, + ) + lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf_sm90( + q_int8, + k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + + o = o[..., :head_dim_og] + + if return_lse: + return ( + o, + lse / 1.44269504 + lse_correction * sm_scale + if smooth_k + else lse / 1.44269504, + ) + else: + return o diff --git a/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/layers.py b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/quant.py b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/quant.py new file mode 100644 index 0000000000000000000000000000000000000000..2e7a32c6a1e502bee12d0e0564ff2b90f6b00462 --- /dev/null +++ b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/quant.py @@ -0,0 +1,326 @@ +""" +Copyright (c) 2024 by SageAttention team. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import torch +from typing import Optional + +from ._ops import ops + + +def per_block_int8( + q: torch.Tensor, + k: torch.Tensor, + km: Optional[torch.Tensor] = None, + BLKQ: int = 128, + BLKK: int = 64, + sm_scale: Optional[float] = None, + tensor_layout: str = "HND", +): + """ + Quantize the query tensor `q` and the key tensor `k` with per block quantization. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + km : Optional[torch.Tensor] + The mean tensor of `k` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]``. + Should be of the same dtype as `k` if provided. Default is None. + + sm_scale : Optional[float] + The scale factor for the softmax operation. Default is ``head_dim**-0.5``. + It will be multiplied by ``1.44269504`` to work together with the triton attention kernel. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + Returns + ------- + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor] + A tuple containing: + - The quantized query tensor. Shape: Same as `q` but with `int8` dtype. 
+ - The scale tensor of the query tensor. Shape: ``[batch_size, num_qo_heads, (qo_len + BLKQ - 1) // BLKQ]`` with `float32` dtype. + - The quantized key tensor. Shape: Same as `k` but with `int8` dtype. + - The scale tensor of the key tensor. Shape: ``[batch_size, num_kv_heads, (kv_len + BLKK - 1) // BLKK]`` with `float32` dtype. + + Note + ---- + - The tensors `q` and `k` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + """ + + q_int8 = torch.empty(q.shape, dtype=torch.int8, device=q.device) + k_int8 = torch.empty(k.shape, dtype=torch.int8, device=k.device) + + if tensor_layout == "HND": + b, h_qo, qo_len, head_dim = q.shape + _, h_kv, kv_len, _ = k.shape + + elif tensor_layout == "NHD": + b, qo_len, h_qo, head_dim = q.shape + _, kv_len, h_kv, _ = k.shape + + else: + raise ValueError(f"Unknown tensor layout: {tensor_layout}") + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + + q_scale = torch.empty( + (b, h_qo, (qo_len + BLKQ - 1) // BLKQ), device=q.device, dtype=torch.float32 + ) + k_scale = torch.empty( + (b, h_kv, (kv_len + BLKK - 1) // BLKK), device=q.device, dtype=torch.float32 + ) + + if sm_scale is None: + sm_scale = head_dim**-0.5 + + sm_scale *= 1.44269504 + + ops.quant_per_block_int8_cuda(q, q_int8, q_scale, sm_scale, BLKQ, _tensor_layout) + if km is not None: + km = km.squeeze(1) if _tensor_layout == 0 else km.squeeze(2) + ops.quant_per_block_int8_fuse_sub_mean_cuda( + k, km, k_int8, k_scale, BLKK, _tensor_layout + ) + else: + # The bound CUDA op expects an sm_scale argument; use 1.0 for K to avoid scaling + ops.quant_per_block_int8_cuda(k, k_int8, k_scale, 1.0, BLKK, _tensor_layout) + + return q_int8, q_scale, k_int8, k_scale + + +def per_warp_int8( + q: torch.Tensor, + k: torch.Tensor, + km: Optional[torch.Tensor] = None, + BLKQ: int = 128, + WARPQ: int = 32, + BLKK: int = 64, + tensor_layout: str = "HND", +): + """ + Quantize the query tensor `q` with per warp quantization and the key tensor `k` with per block quantization. + Warp size of quantizing `q` is 16 or 32, with a block size of 64 or 128. + Block size of quantizing `k` is 64 or 128. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + km : Optional[torch.Tensor] + The mean tensor of `k` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]``. + Should be of the same dtype as `k` if provided. Default is None. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + Returns + ------- + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor] + A tuple containing: + - The quantized query tensor. Shape: Same as `q` but with `int8` dtype. + - The scale tensor of the query tensor. Shape: ``[batch_size, num_qo_heads, (qo_len + BLKQ - 1) // BLKQ * (BLKQ // WARPQ)]`` with `float32` dtype. + - The quantized key tensor. Shape: Same as `k` but with `int8` dtype. + - The scale tensor of the key tensor. Shape: ``[batch_size, num_kv_heads, (kv_len + BLKK - 1) // BLKK]`` with `float32` dtype. 
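
To make the scale shapes documented above concrete, here is a small sketch of calling `per_warp_int8` with its default block sizes; the tensor sizes are illustrative assumptions, not values from this PR.

```python
import torch
from sage_attention import per_warp_int8

b, h, seq_len, head_dim = 1, 4, 512, 128
q = torch.randn(b, h, seq_len, head_dim, dtype=torch.float16, device="cuda")
k = torch.randn(b, h, seq_len, head_dim, dtype=torch.float16, device="cuda")

# Defaults: BLKQ=128, WARPQ=32, BLKK=64, tensor_layout="HND".
q_int8, q_scale, k_int8, k_scale = per_warp_int8(q, k)

print(q_int8.dtype, k_int8.dtype)  # torch.int8, torch.int8
print(q_scale.shape)  # [1, 4, ceil(512/128) * (128//32)] = [1, 4, 16]
print(k_scale.shape)  # [1, 4, ceil(512/64)]              = [1, 4, 8]
```
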
+ + Note + ---- + - The tensors `q` and `k` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + """ + + q_int8 = torch.empty(q.shape, dtype=torch.int8, device=q.device) + k_int8 = torch.empty(k.shape, dtype=torch.int8, device=k.device) + + if tensor_layout == "HND": + b, h_qo, qo_len, head_dim = q.shape + _, h_kv, kv_len, _ = k.shape + + elif tensor_layout == "NHD": + b, qo_len, h_qo, head_dim = q.shape + _, kv_len, h_kv, _ = k.shape + + else: + raise ValueError(f"Unknown tensor layout: {tensor_layout}") + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + + q_scale = torch.empty( + (b, h_qo, ((qo_len + BLKQ - 1) // BLKQ) * (BLKQ // WARPQ)), + device=q.device, + dtype=torch.float32, + ) + k_scale = torch.empty( + (b, h_kv, (kv_len + BLKK - 1) // BLKK), device=q.device, dtype=torch.float32 + ) + + ops.quant_per_warp_int8_cuda(q, q_int8, q_scale, BLKQ, WARPQ, _tensor_layout) + + if km is not None: + km = km.squeeze(1) if _tensor_layout == 0 else km.squeeze(2) + ops.quant_per_block_int8_fuse_sub_mean_cuda( + k, km, k_int8, k_scale, BLKK, _tensor_layout + ) + else: + # The bound CUDA op expects an sm_scale argument; use 1.0 for K to avoid scaling + ops.quant_per_block_int8_cuda(k, k_int8, k_scale, 1.0, BLKK, _tensor_layout) + + return q_int8, q_scale, k_int8, k_scale + + +def sub_mean(v: torch.Tensor, tensor_layout: str = "HND"): + """ + Calculate the mean of the tensor `v` along the sequence length dimension and subtract it from `v`. Result is stored as fp16. + + Parameters + ---------- + v : torch.Tensor + The input tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + Returns + ------- + Tuple[torch.Tensor, torch.Tensor] + A tuple containing: + - The tensor `v_smoothed` with the mean subtracted and stored as fp16. Shape: Same as `v` with `float16` dtype. + - The mean tensor of `v` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]`` with dtype same as `v`. + + Note + ---- + - The tensors `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - The returned tensor `v_smoothed` will have dtype ``torch.float16`` regardless of the input dtype. + - The returned mean tensor will have the same dtype as the input tensor. + """ + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + vm = v.mean(dim=1 if _tensor_layout == 0 else 2) + + v_smoothed = torch.empty(v.shape, dtype=torch.float16, device=v.device) + + # subtract mean and store the result as fp16 + ops.sub_mean_cuda(v, vm, v_smoothed, _tensor_layout) + + return v_smoothed, vm + + +def per_channel_fp8( + v: torch.Tensor, + tensor_layout: str = "HND", + scale_max: float = 448.0, + smooth_v: bool = True, +): + """ + Transpose, pad and permute the tensor `v` and quantize it to fp8 with per channel quantization. + `v` is first transposed along the head dimension and the sequence length dimension, then padded to a multiple of 64. + After that, the tensor is permuted along the sequence length dimension by ``[0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15]``. + The quantization is done per channel, with the scale value and smooth factor calculated per channel. + + Parameters + ---------- + v : torch.Tensor + The input tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. 
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + scale_max : float + The maximum scale value for the quantization. Default is 448.0 (upper bound of E4M3 data format). + + smooth_v : bool + Whether to smooth the quantized tensor. Default is True. + + Returns + ------- + Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] + A tuple containing: + - The quantized tensor `v_fp8`. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, head_dim, (kv_len + 63) // 64 * 64]``, with `float8_e4m3fn` dtype. + - If `tensor_layout` is "NHD": ``[batch_size, head_dim, num_kv_heads, (kv_len + 63) // 64 * 64]``, with `float8_e4m3fn` dtype. + - The scale tensor of `v`. Shape: ``[batch_size, num_kv_heads, head_dim]`` with `float32` dtype. + - The mean tensor of `v` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]`` with `float32` dtype. + + Note + ---- + - The tensors `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - The returned mean tensor will be None if `smooth_v` is False. Otherwise it will have dtype ``torch.float32``. + """ + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + + if tensor_layout == "HND": + b, h_kv, kv_len, head_dim = v.shape + padded_len = (kv_len + 63) // 64 * 64 + v_transposed_permutted = torch.empty( + (b, h_kv, head_dim, padded_len), dtype=v.dtype, device=v.device + ) + + elif tensor_layout == "NHD": + b, kv_len, h_kv, head_dim = v.shape + padded_len = (kv_len + 63) // 64 * 64 + v_transposed_permutted = torch.empty( + (b, head_dim, h_kv, padded_len), dtype=v.dtype, device=v.device + ) + + ops.transpose_pad_permute_cuda(v, v_transposed_permutted, _tensor_layout) + + v_fp8 = torch.empty( + v_transposed_permutted.shape, dtype=torch.float8_e4m3fn, device=v.device + ) + + v_scale = torch.empty((b, h_kv, head_dim), dtype=torch.float32, device=v.device) + vm = torch.empty((b, h_kv, head_dim), dtype=torch.float32, device=v.device) + + if smooth_v: + ops.mean_scale_fuse_quant_cuda( + v_transposed_permutted, + v_fp8, + vm, + v_scale, + kv_len, + scale_max, + _tensor_layout, + ) + return v_fp8, v_scale, vm + else: + ops.scale_fuse_quant_cuda( + v_transposed_permutted, v_fp8, v_scale, kv_len, scale_max, _tensor_layout + ) + return v_fp8, v_scale, None diff --git a/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/quant_per_thread.py b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/quant_per_thread.py new file mode 100644 index 0000000000000000000000000000000000000000..ab81f57c3e6dd9df89946ba54c4e2c3844c94d34 --- /dev/null +++ b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/quant_per_thread.py @@ -0,0 +1,204 @@ +""" +Copyright (c) 2024 by SageAttention team. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
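
Before the Triton kernels that follow in `quant_per_thread.py`: per thread group they apply a symmetric INT8 scheme with round-half-away-from-zero. The plain-PyTorch reference below is only an illustration of that scalar recipe under a single global scale (the real kernels compute one scale per thread's slice); it is not code from this PR.

```python
import torch

def int8_quantize_reference(x: torch.Tensor):
    """Reference for the per-group INT8 recipe used in the Triton kernels:
    scale = max|x| / 127 + 1e-7, then round half away from zero."""
    x = x.float()
    scale = x.abs().max() / 127.0 + 1e-7          # matches `tl.max(tl.abs(x)) / 127. + 0.0000001`
    y = x / scale
    y = y + 0.5 * torch.where(y >= 0, 1.0, -1.0)  # matches `x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1)`
    # float -> int8 conversion truncates toward zero, so the +/-0.5 shift rounds away from zero.
    return y.to(torch.int8), scale

x = torch.randn(64, 128)
x_int8, scale = int8_quantize_reference(x)
print(x_int8.dtype, float(scale))
```
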
+""" + +import torch +import triton +import triton.language as tl + +@triton.jit +def quant_query_per_thread_int8_kernel(Input, Output, Scale, L, + stride_iz, stride_ih, stride_in, + stride_oz, stride_oh, stride_on, + stride_sz, stride_sh, + C: tl.constexpr, BLK: tl.constexpr): + off_blk = tl.program_id(0) // 8 + off_tld = tl.program_id(0) % 8 + off_h = tl.program_id(1) + off_b = tl.program_id(2) + + offs_n = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld + offs_k = tl.arange(0, C) + + input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :] + output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :] + scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 8 + off_tld + + x = tl.load(input_ptrs, mask=offs_n[:, None] < L) + x = x.to(tl.float32) + scale = tl.max(tl.abs(x)) / 127. + 0.0000001 + x_int8 = x / scale + x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1) + x_int8 = x_int8.to(tl.int8) + tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L) + tl.store(scale_ptrs, scale) + +@triton.jit +def quant_key_per_thread_int8_kernel(Input, Output, Scale, L, + stride_iz, stride_ih, stride_in, + stride_oz, stride_oh, stride_on, + stride_sz, stride_sh, + C: tl.constexpr, BLK: tl.constexpr): + off_blk = tl.program_id(0) // 4 + off_tld = tl.program_id(0) % 4 + off_h = tl.program_id(1) + off_b = tl.program_id(2) + + # offs_n = off_blk * BLK + tl.cat(tl.arange(0, BLK // 8) * 8, tl.arange(0, BLK // 8) * 8 + 1, True) + off_tld * 2 + # offs_k = tl.arange(0, C) + + # input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :] + # output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :] + # scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 4 + off_tld + + # x = tl.load(input_ptrs, mask=offs_n[:, None] < L) + # x = x.to(tl.float32) + # scale = tl.max(tl.abs(x)) / 127. + 0.0000001 + # x_int8 = x / scale + # x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1) + # x_int8 = x_int8.to(tl.int8) + # tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L) + # tl.store(scale_ptrs, scale) + + offs_n0 = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld * 2 + offs_n1 = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld * 2 + 1 + offs_k = tl.arange(0, C) + + input_ptrs0 = Input + off_b * stride_iz + off_h * stride_ih + offs_n0[:, None] * stride_in + offs_k[None, :] + input_ptrs1 = Input + off_b * stride_iz + off_h * stride_ih + offs_n1[:, None] * stride_in + offs_k[None, :] + output_ptrs0 = Output + off_b * stride_oz + off_h * stride_oh + offs_n0[:, None] * stride_on + offs_k[None, :] + output_ptrs1 = Output + off_b * stride_oz + off_h * stride_oh + offs_n1[:, None] * stride_on + offs_k[None, :] + scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 4 + off_tld + + x0 = tl.load(input_ptrs0, mask=offs_n0[:, None] < L) + x1 = tl.load(input_ptrs1, mask=offs_n1[:, None] < L) + x0 = x0.to(tl.float32) + x1 = x1.to(tl.float32) + scale = max(tl.max(tl.abs(x0)), tl.max(tl.abs(x1))) / 127. 
+ 0.0000001 + x0_int8 = x0 / scale + x1_int8 = x1 / scale + x0_int8 += 0.5 * tl.where(x0_int8 >= 0, 1, -1) + x1_int8 += 0.5 * tl.where(x1_int8 >= 0, 1, -1) + x0_int8 = x0_int8.to(tl.int8) + x1_int8 = x1_int8.to(tl.int8) + tl.store(output_ptrs0, x0_int8, mask=offs_n0[:, None] < L) + tl.store(output_ptrs1, x1_int8, mask=offs_n1[:, None] < L) + tl.store(scale_ptrs, scale) + +@triton.jit +def quant_query_per_thread_int4_kernel(Input, Output, Scale, L, + stride_iz, stride_ih, stride_in, + stride_oz, stride_oh, stride_on, + stride_sz, stride_sh, + C: tl.constexpr, BLK: tl.constexpr): + off_blk = tl.program_id(0) // 8 + off_tld = tl.program_id(0) % 8 + off_h = tl.program_id(1) + off_b = tl.program_id(2) + + offs_n = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld + offs_k = tl.arange(0, C) + + input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :] + output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :] + scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 8 + off_tld + + x = tl.load(input_ptrs, mask=offs_n[:, None] < L) + x = x.to(tl.float32) + scale = tl.max(tl.abs(x)) / 7. + 0.0000001 + x_int8 = x / scale + x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1) + x_int8 = x_int8.to(tl.int8) + tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L) + tl.store(scale_ptrs, scale) + +@triton.jit +def quant_key_per_thread_int4_kernel(Input, Output, Scale, L, + stride_iz, stride_ih, stride_in, + stride_oz, stride_oh, stride_on, + stride_sz, stride_sh, + C: tl.constexpr, BLK: tl.constexpr): + off_blk = tl.program_id(0) // 4 + off_tld = tl.program_id(0) % 4 + off_h = tl.program_id(1) + off_b = tl.program_id(2) + + offs_n = off_blk * BLK + tl.cat(tl.arange(0, BLK // 8) * 8, tl.arange(0, BLK // 8) * 8 + 1, True) + off_tld * 2 + offs_k = tl.arange(0, C) + + input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :] + output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :] + scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 4 + off_tld + + x = tl.load(input_ptrs, mask=offs_n[:, None] < L) + x = x.to(tl.float32) + scale = tl.max(tl.abs(x)) / 7. 
+ 0.0000001 + x_int8 = x / scale + x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1) + x_int8 = x_int8.to(tl.int8) + tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L) + tl.store(scale_ptrs, scale) + +def per_thread_int8(q, k, km=None, BLKQ=128, WARPQ=32, BLKK=64, WARPK=64, sm_scale=None, tensor_layout="HND"): + q_int8 = torch.empty(q.shape, dtype=torch.int8, device=q.device) + k_int8 = torch.empty(k.shape, dtype=torch.int8, device=k.device) + + if km is not None: + k = k - km + + if tensor_layout == "HND": + b, h_qo, qo_len, head_dim = q.shape + _, h_kv, kv_len, _ = k.shape + + stride_bz_q, stride_h_q, stride_seq_q = q.stride(0), q.stride(1), q.stride(2) + stride_bz_qo, stride_h_qo, stride_seq_qo = q_int8.stride(0), q_int8.stride(1), q_int8.stride(2) + stride_bz_k, stride_h_k, stride_seq_k = k.stride(0), k.stride(1), k.stride(2) + stride_bz_ko, stride_h_ko, stride_seq_ko = k_int8.stride(0), k_int8.stride(1), k_int8.stride(2) + elif tensor_layout == "NHD": + b, qo_len, h_qo, head_dim = q.shape + _, kv_len, h_kv, _ = k.shape + + stride_bz_q, stride_h_q, stride_seq_q = q.stride(0), q.stride(2), q.stride(1) + stride_bz_qo, stride_h_qo, stride_seq_qo = q_int8.stride(0), q_int8.stride(2), q_int8.stride(1) + stride_bz_k, stride_h_k, stride_seq_k = k.stride(0), k.stride(2), k.stride(1) + stride_bz_ko, stride_h_ko, stride_seq_ko = k_int8.stride(0), k_int8.stride(2), k_int8.stride(1) + else: + raise ValueError(f"Unknown tensor layout: {tensor_layout}") + + q_scale = torch.empty((b, h_qo, (qo_len + BLKQ - 1) // BLKQ * (BLKQ // WARPQ) * 8), device=q.device, dtype=torch.float32) + k_scale = torch.empty((b, h_kv, (kv_len + BLKK - 1) // BLKK * (BLKK // WARPK) * 4), device=q.device, dtype=torch.float32) + + if sm_scale is None: + sm_scale = head_dim**-0.5 + + grid = ((qo_len + BLKQ - 1) // BLKQ * (BLKQ // WARPQ) * 8, h_qo, b) + quant_query_per_thread_int8_kernel[grid]( + q, q_int8, q_scale, qo_len, + stride_bz_q, stride_h_q, stride_seq_q, + stride_bz_qo, stride_h_qo, stride_seq_qo, + q_scale.stride(0), q_scale.stride(1), + C=head_dim, BLK=WARPQ + ) + + grid = ((kv_len + BLKK - 1) // BLKK * (BLKK // WARPK) * 4, h_kv, b) + quant_key_per_thread_int8_kernel[grid]( + k, k_int8, k_scale, kv_len, + stride_bz_k, stride_h_k, stride_seq_k, + stride_bz_ko, stride_h_ko, stride_seq_ko, + k_scale.stride(0), k_scale.stride(1), + C=head_dim, BLK=WARPK + ) + + return q_int8, q_scale, k_int8, k_scale \ No newline at end of file diff --git a/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__init__.py b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5df69e92664edfbd0820d5126792e41e23a72762 --- /dev/null +++ b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__init__.py @@ -0,0 +1,12 @@ +from .quant import per_block_int8, per_warp_int8, sub_mean, per_channel_fp8 +from .core import sageattn, sageattn_qk_int8_pv_fp8_cuda + + +__all__ = [ + "per_block_int8", + "per_warp_int8", + "sub_mean", + "per_channel_fp8", + "sageattn", + "sageattn_qk_int8_pv_fp8_cuda", +] \ No newline at end of file diff --git a/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e6a109e1ab12f4cc4b21a8573d182233206d911f Binary files /dev/null and b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc differ diff --git 
a/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2298b2decd5b4e5fcb76a03f1155d71e41292ad5 Binary files /dev/null and b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d7face6a40bbed59ea000478be0b1385bcf803b4 Binary files /dev/null and b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3e68303a9974569a9c4734e1a7029914b49e77bb Binary files /dev/null and b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bba733f559eaa89d8b281bc99f2b3992b38cc114 Binary files /dev/null and b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/_ops.py b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..7dc56aab172e24e052a9f0c78060ecfbdff00309 --- /dev/null +++ b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _sage_attention_44b112f_dirty +ops = torch.ops._sage_attention_44b112f_dirty + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_sage_attention_44b112f_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/_sage_attention_44b112f_dirty.abi3.so b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/_sage_attention_44b112f_dirty.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..c45bfd2be6bcfb77552fff0eee73b5ebe19f6452 --- /dev/null +++ b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/_sage_attention_44b112f_dirty.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d47c952dd9781283ff0dcbd533779de33b0bfa1966dcc0cc8accd0412217c1c5 +size 26553840 diff --git a/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/core.py b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/core.py new file mode 100644 index 0000000000000000000000000000000000000000..dc44a8e1ee17a5c5c65da5adda6faf9228cca55e --- /dev/null +++ b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/core.py @@ -0,0 +1,983 @@ +""" +Copyright (c) 2024 by SageAttention team. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import torch +import torch.nn.functional as F + +from ._ops import ops + + +from .quant import per_warp_int8 as per_warp_int8_cuda +from .quant import sub_mean +from .quant import per_channel_fp8 +from .quant_per_thread import per_thread_int8 as per_thread_int8_triton + +from typing import Any, List, Literal, Optional, Tuple, Union +import warnings + + +import subprocess +import re + + +def get_cuda_version(): + try: + output = subprocess.check_output(["nvcc", "--version"]).decode() + match = re.search(r"release (\d+)\.(\d+)", output) + if match: + major, minor = int(match.group(1)), int(match.group(2)) + return major, minor + except Exception as e: + print("Failed to get CUDA version:", e) + return None, None + + +def get_cuda_arch_versions(): + cuda_archs = [] + for i in range(torch.cuda.device_count()): + major, minor = torch.cuda.get_device_capability(i) + cuda_archs.append(f"sm{major}{minor}") + return cuda_archs + + +def sageattn( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + tensor_layout: str = "HND", + is_causal: bool = False, + sm_scale: Optional[float] = None, + return_lse: bool = False, + **kwargs: Any, +): + """ + Automatically selects the appropriate implementation of the SageAttention kernel based on the GPU compute capability. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + v : torch.Tensor + The value tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + is_causal : bool + Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len. + Default: False. + + sm_scale : Optional[float] + The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``. + + return_lse : bool + Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention. + Default: False. + + Returns + ------- + torch.Tensor + The output tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + torch.Tensor + The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor). + Shape: ``[batch_size, num_qo_heads, qo_len]``. + Only returned if `return_lse` is True. + + Note + ---- + - ``num_qo_heads`` must be divisible by ``num_kv_heads``. + - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - All tensors must be on the same cuda device. 
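+
+    Example
+    -------
+    A minimal usage sketch; the batch size, head count, sequence length and head
+    dimension below are illustrative, not requirements::
+
+        >>> q = torch.randn(1, 8, 1024, 128, dtype=torch.float16, device="cuda")
+        >>> k = torch.randn(1, 8, 1024, 128, dtype=torch.float16, device="cuda")
+        >>> v = torch.randn(1, 8, 1024, 128, dtype=torch.float16, device="cuda")
+        >>> o = sageattn(q, k, v, tensor_layout="HND", is_causal=True)
+        >>> o.shape
+        torch.Size([1, 8, 1024, 128])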
+ """ + + arch = get_cuda_arch_versions()[q.device.index] + if arch == "sm80": + return sageattn_qk_int8_pv_fp16_cuda( + q, + k, + v, + tensor_layout=tensor_layout, + is_causal=is_causal, + sm_scale=sm_scale, + return_lse=return_lse, + pv_accum_dtype="fp32", + ) + elif arch == "sm89": + return sageattn_qk_int8_pv_fp8_cuda( + q, + k, + v, + tensor_layout=tensor_layout, + is_causal=is_causal, + sm_scale=sm_scale, + return_lse=return_lse, + pv_accum_dtype="fp32+fp16", + ) + elif arch == "sm90": + return sageattn_qk_int8_pv_fp8_cuda_sm90( + q, + k, + v, + tensor_layout=tensor_layout, + is_causal=is_causal, + sm_scale=sm_scale, + return_lse=return_lse, + pv_accum_dtype="fp32+fp32", + ) + elif arch == "sm120": + return sageattn_qk_int8_pv_fp8_cuda( + q, + k, + v, + tensor_layout=tensor_layout, + is_causal=is_causal, + qk_quant_gran="per_warp", + sm_scale=sm_scale, + return_lse=return_lse, + pv_accum_dtype="fp32+fp16", + ) # sm120 has accurate fp32 accumulator for fp8 mma and triton kernel is currently not usable on sm120. + else: + raise ValueError(f"Unsupported CUDA architecture: {arch}") + + +@torch.compiler.disable +def sageattn_qk_int8_pv_fp16_cuda( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + tensor_layout: str = "HND", + is_causal: bool = False, + qk_quant_gran: str = "per_thread", + sm_scale: Optional[float] = None, + pv_accum_dtype: str = "fp32", + smooth_k: bool = True, + smooth_v: bool = False, + return_lse: bool = False, + **kwargs: Any, +) -> torch.Tensor: + """ + SageAttention with INT8 quantization for Q and K, FP16 PV with FP16/FP32 accumulation, implemented using CUDA. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + v : torch.Tensor + The value tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + is_causal : bool + Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len. + Default: False. + + qk_quant_gran : str + The granularity of quantization for Q and K, either "per_warp" or "per_thread". + Default: "per_thread". + + sm_scale : Optional[float] + The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``. + + pv_accum_dtype : str + The dtype of the accumulation of the product of the value tensor and the attention weights, either "fp16", "fp16+fp32" or "fp32". + - "fp16": PV accumulation is done in fully in FP16. This is the fastest option but may lead to numerical instability. `smooth_v` option will increase the accuracy in cases when the value tensor has a large bias (like in CogVideoX-2b). + - "fp32": PV accumulation is done in FP32. This is the most accurate option but may be slower than "fp16" due to CUDA core overhead. + - "fp16+fp32": PV accumulation is done in FP16, but added to a FP32 buffer every few iterations. This offers a balance between speed and accuracy. + Default: "fp32". 
+ + smooth_k : bool + Whether to smooth the key tensor by subtracting the mean along the sequence dimension. + Default: True. + + smooth_v : bool + Whether to smooth the value tensor by subtracting the mean along the sequence dimension. + smooth_v will be ignored if pv_accum_dtype is "fp32" or "fp16+fp32". + Default: False. + + return_lse : bool + Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention. + Default: False. + + Returns + ------- + torch.Tensor + The output tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + torch.Tensor + The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor). + Shape: ``[batch_size, num_qo_heads, qo_len]``. + Only returned if `return_lse` is True. + + Note + ---- + - ``num_qo_heads`` must be divisible by ``num_kv_heads``. + - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - All tensors must be on the same cuda device. + - `smooth_k` will introduce slight overhead but will improve the accuracy under most circumstances. + """ + + dtype = q.dtype + assert q.is_cuda, "Input tensors must be on cuda." + assert dtype in [torch.float16, torch.bfloat16], ( + "Input tensors must be in dtype of torch.float16 or torch.bfloat16" + ) + assert qk_quant_gran in ["per_warp", "per_thread"], ( + "qk_quant_gran must be either 'per_warp' or 'per_thread'." + ) + assert q.device == k.device == v.device, "All tensors must be on the same device." + assert q.dtype == k.dtype == v.dtype, "All tensors must have the same dtype." + + # FIXME(DefTruth): make sage attention work compatible with distributed + # env, for example, xDiT which launch by torchrun. Without this workaround, + # sage attention will run into illegal memory access error after first + # inference step in distributed env for multi gpus inference. This small + # workaround also make sage attention work compatible with torch.compile + # through non-fullgraph compile mode. + torch.cuda.set_device(v.device) + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + _is_caual = 1 if is_causal else 0 + _qk_quant_gran = 3 if qk_quant_gran == "per_thread" else 2 + _return_lse = 1 if return_lse else 0 + + head_dim_og = q.size(-1) + + if head_dim_og < 64: + q = torch.nn.functional.pad(q, (0, 64 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 64 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 64 - head_dim_og)) + elif head_dim_og > 64 and head_dim_og < 128: + q = torch.nn.functional.pad(q, (0, 128 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 128 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 128 - head_dim_og)) + elif head_dim_og > 128: + raise ValueError(f"Unsupported head_dim: {head_dim_og}") + + # assert last dim is contiguous + assert q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1, ( + "Last dim of qkv must be contiguous." 
+ ) + + if sm_scale is None: + sm_scale = head_dim_og**-0.5 + + seq_dim = 1 if _tensor_layout == 0 else 2 + nh_dim = 2 if _tensor_layout == 0 else 1 + + if smooth_k: + km = k.mean(dim=seq_dim, keepdim=True) + nqheads = q.size(2) + nkheads = k.size(2) + q_per_kv_heads = nqheads // nkheads + if q_per_kv_heads > 1: + # nheads_k => nheads_q + km_broadcast = torch.repeat_interleave(km, q_per_kv_heads, dim=nh_dim) + else: + km_broadcast = km + if return_lse: + if tensor_layout == "NHD": + lse_correction = ( + torch.matmul( + q.transpose(1, 2), km_broadcast.transpose(1, 2).transpose(2, 3) + ) + .squeeze(-1) + .to(torch.float32) + ) + else: + lse_correction = ( + torch.matmul(q, km_broadcast.transpose(2, 3)) + .squeeze(-1) + .to(torch.float32) + ) + else: + km = None + + if qk_quant_gran == "per_warp": + q_int8, q_scale, k_int8, k_scale = per_warp_int8_cuda( + q, + k, + km, + tensor_layout=tensor_layout, + BLKQ=128, + WARPQ=(16 if (q.size(-1) == 128 and pv_accum_dtype == "fp16+fp32") else 32), + BLKK=64, + ) + elif qk_quant_gran == "per_thread": + q_int8, q_scale, k_int8, k_scale = per_thread_int8_triton( + q, + k, + km, + tensor_layout=tensor_layout, + BLKQ=128, + WARPQ=(16 if (q.size(-1) == 128 and pv_accum_dtype == "fp16+fp32") else 32), + BLKK=64, + WARPK=64, + ) + + o = torch.empty(q.size(), dtype=dtype, device=q.device) + + if pv_accum_dtype in ["fp32", "fp16+fp32"] and smooth_v: + warnings.warn(f"pv_accum_dtype is {pv_accum_dtype}, smooth_v will be ignored.") + smooth_v = False + + if pv_accum_dtype == "fp32": + v = v.to(torch.float16) + lse = _qattn_sm80.qk_int8_sv_f16_accum_f32_attn( + q_int8, + k_int8, + v, + o, + q_scale, + k_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + elif pv_accum_dtype == "fp16": + if smooth_v: + smoothed_v, vm = sub_mean(v, tensor_layout=tensor_layout) + lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_fuse_v_mean_attn( + q_int8, + k_int8, + smoothed_v, + o, + q_scale, + k_scale, + vm, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + else: + v = v.to(torch.float16) + lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_attn( + q_int8, + k_int8, + v, + o, + q_scale, + k_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + elif pv_accum_dtype == "fp16+fp32": + v = v.to(torch.float16) + lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_attn_inst_buf( + q_int8, + k_int8, + v, + o, + q_scale, + k_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + else: + raise ValueError(f"Unsupported pv_accum_dtype: {pv_accum_dtype}") + + o = o[..., :head_dim_og] + + if return_lse: + return ( + o, + lse / 1.44269504 + lse_correction * sm_scale + if smooth_k + else lse / 1.44269504, + ) + else: + return o + + +@torch.compiler.disable +def sageattn_qk_int8_pv_fp8_cuda( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + tensor_layout: str = "HND", + is_causal: bool = False, + qk_quant_gran: str = "per_thread", + sm_scale: Optional[float] = None, + pv_accum_dtype: str = "fp32+fp16", + smooth_k: bool = True, + smooth_v: bool = False, + return_lse: bool = False, + **kwargs: Any, +) -> torch.Tensor: + """ + SageAttention with INT8 quantization for Q and K, FP8 PV with FP32 accumulation, implemented using CUDA. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. 
+ + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + v : torch.Tensor + The value tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + is_causal : bool + Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len. + Default: False. + + qk_quant_gran : str + The granularity of quantization for Q and K, either "per_warp" or "per_thread". + Default: "per_thread". + + sm_scale : Optional[float] + The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``. + + pv_accum_dtype : str + The dtype of the accumulation of the product of the value tensor and the attention weights, either "fp32" or "fp32+fp32". + - "fp32": PV accumulation is done in fully in FP32. However, due to the hardware issue, there are only 22 valid bits in the FP32 accumulator. + - "fp32+fp32": PV accumulation is done in FP32 (actually FP22), but added to a FP32 buffer every few iterations. This offers a balance between speed and accuracy. + Default: "fp32+fp32". + + smooth_k : bool + Whether to smooth the key tensor by subtracting the mean along the sequence dimension. + Default: True. + + smooth_v : bool + Whether to smooth the value tensor by subtracting the mean along the sequence dimension. + smooth_v will be ignored if pv_accum_dtype is "fp32+fp32". + Default: False. + + return_lse : bool + Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention. + Default: False. + + Returns + ------- + torch.Tensor + The output tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + torch.Tensor + The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor). + Shape: ``[batch_size, num_qo_heads, qo_len]``. + Only returned if `return_lse` is True. + + Note + ---- + - ``num_qo_heads`` must be divisible by ``num_kv_heads``. + - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - All tensors must be on the same cuda device. + - `smooth_k` will introduce slight overhead but will improve the accuracy under most circumstances. + """ + + dtype = q.dtype + assert q.is_cuda, "Input tensors must be on cuda." + assert dtype in [torch.float16, torch.bfloat16], ( + "Input tensors must be in dtype of torch.float16 or torch.bfloat16" + ) + assert qk_quant_gran in ["per_warp", "per_thread"], ( + "qk_quant_gran must be either 'per_warp' or 'per_thread'." + ) + assert q.device == k.device == v.device, "All tensors must be on the same device." + assert q.dtype == k.dtype == v.dtype, "All tensors must have the same dtype." + + # cuda_major_version, cuda_minor_version = get_cuda_version() + # if(cuda_major_version, cuda_minor_version) < (12, 8) and pv_accum_dtype == 'fp32+fp16': + # warnings.warn("cuda version < 12.8, change pv_accum_dtype to 'fp32+fp32'") + # pv_accum_dtype = 'fp32+fp32' + + # FIXME(DefTruth): make sage attention work compatible with distributed + # env, for example, xDiT which launch by torchrun. 
Without this workaround, + # sage attention will run into illegal memory access error after first + # inference step in distributed env for multi gpus inference. This small + # workaround also make sage attention work compatible with torch.compile + # through non-fullgraph compile mode. + torch.cuda.set_device(v.device) + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + _is_caual = 1 if is_causal else 0 + _qk_quant_gran = 3 if qk_quant_gran == "per_thread" else 2 + _return_lse = 1 if return_lse else 0 + + head_dim_og = q.size(-1) + + if head_dim_og < 64: + q = torch.nn.functional.pad(q, (0, 64 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 64 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 64 - head_dim_og)) + elif head_dim_og > 64 and head_dim_og < 128: + q = torch.nn.functional.pad(q, (0, 128 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 128 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 128 - head_dim_og)) + elif head_dim_og > 128: + raise ValueError(f"Unsupported head_dim: {head_dim_og}") + + # assert last dim is contiguous + assert q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1, ( + "Last dim of qkv must be contiguous." + ) + + if sm_scale is None: + sm_scale = head_dim_og**-0.5 + + seq_dim = 1 if _tensor_layout == 0 else 2 + nh_dim = 2 if _tensor_layout == 0 else 1 + + if smooth_k: + km = k.mean(dim=seq_dim, keepdim=True) + nqheads = q.size(2) + nkheads = k.size(2) + q_per_kv_heads = nqheads // nkheads + if q_per_kv_heads > 1: + # nheads_k => nheads_q + km_broadcast = torch.repeat_interleave(km, q_per_kv_heads, dim=nh_dim) + else: + km_broadcast = km + if return_lse: + if tensor_layout == "NHD": + lse_correction = ( + torch.matmul( + q.transpose(1, 2), km_broadcast.transpose(1, 2).transpose(2, 3) + ) + .squeeze(-1) + .to(torch.float32) + ) + else: + lse_correction = ( + torch.matmul(q, km_broadcast.transpose(2, 3)) + .squeeze(-1) + .to(torch.float32) + ) + else: + km = None + + if qk_quant_gran == "per_warp": + q_int8, q_scale, k_int8, k_scale = per_warp_int8_cuda( + q, k, km, tensor_layout=tensor_layout, BLKQ=128, WARPQ=32, BLKK=64 + ) + elif qk_quant_gran == "per_thread": + q_int8, q_scale, k_int8, k_scale = per_thread_int8_triton( + q, k, km, tensor_layout=tensor_layout, BLKQ=128, WARPQ=32, BLKK=64, WARPK=64 + ) + + o = torch.empty(q.size(), dtype=dtype, device=q.device) + + if pv_accum_dtype == "fp32+fp32" and smooth_v: + warnings.warn("pv_accum_dtype is 'fp32+fp32', smooth_v will be ignored.") + smooth_v = False + + if pv_accum_dtype == "fp32+fp16" and smooth_v: + warnings.warn("pv_accum_dtype is 'fp32+fp16', smooth_v will be ignored.") + smooth_v = False + + quant_v_scale_max = 448.0 + if pv_accum_dtype == "fp32+fp16": + quant_v_scale_max = 2.25 + + v_fp8, v_scale, vm = per_channel_fp8( + v, tensor_layout=tensor_layout, scale_max=quant_v_scale_max, smooth_v=smooth_v + ) + print("before kernel call") + if pv_accum_dtype == "fp32": + if smooth_v: + lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_fuse_v_mean_attn( + q_int8, + k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + vm, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + torch.cuda.synchronize() + else: + lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn( + q_int8, + k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + torch.cuda.synchronize() + elif pv_accum_dtype == "fp32+fp32": + lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf( + q_int8, + 
k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + torch.cuda.synchronize() + elif pv_accum_dtype == "fp32+fp16": + lse = ops.qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf( + q_int8, + k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + torch.cuda.synchronize() + o = o[..., :head_dim_og] + print("after kernel call") + if return_lse: + return ( + o, + lse / 1.44269504 + lse_correction * sm_scale + if smooth_k + else lse / 1.44269504, + ) + else: + return o + + +@torch.compiler.disable +def sageattn_qk_int8_pv_fp8_cuda_sm90( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + tensor_layout: str = "HND", + is_causal: bool = False, + qk_quant_gran: str = "per_thread", + sm_scale: Optional[float] = None, + pv_accum_dtype: str = "fp32+fp32", + smooth_k: bool = True, + return_lse: bool = False, + **kwargs: Any, +) -> torch.Tensor: + """ + SageAttention with INT8 quantization for Q and K, FP8 PV with FP32 accumulation, implemented using CUDA. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + v : torch.Tensor + The value tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + is_causal : bool + Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len. + Default: False. + + qk_quant_gran : str + The granularity of quantization for Q and K, either "per_warp" or "per_thread". + Default: "per_thread". + + sm_scale : Optional[float] + The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``. + + pv_accum_dtype : str + The dtype of the accumulation of the product of the value tensor and the attention weights, either "fp32" or "fp32+fp32". + - "fp32": PV accumulation is done in fully in FP32. However, due to the hardware issue, there are only 22 valid bits in the FP32 accumulator. + - "fp32+fp32": PV accumulation is done in FP32 (actually FP22), but added to a FP32 buffer every few iterations. This offers a balance between speed and accuracy. + Default: "fp32+fp32". + + smooth_k : bool + Whether to smooth the key tensor by subtracting the mean along the sequence dimension. + Default: True. + + return_lse : bool + Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention. + Default: False. + + Returns + ------- + torch.Tensor + The output tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + torch.Tensor + The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor). + Shape: ``[batch_size, num_qo_heads, qo_len]``. + Only returned if `return_lse` is True. + + Note + ---- + - ``num_qo_heads`` must be divisible by ``num_kv_heads``. 
+ - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - All tensors must be on the same cuda device. + - `smooth_k` will introduce slight overhead but will improve the accuracy under most circumstances. + """ + + dtype = q.dtype + assert q.is_cuda, "Input tensors must be on cuda." + assert dtype in [torch.float16, torch.bfloat16], ( + "Input tensors must be in dtype of torch.float16 or torch.bfloat16" + ) + assert qk_quant_gran in ["per_warp", "per_thread"], ( + "qk_quant_gran must be either 'per_warp' or 'per_thread'." + ) + assert q.device == k.device == v.device, "All tensors must be on the same device." + assert q.dtype == k.dtype == v.dtype, "All tensors must have the same dtype." + + torch.cuda.set_device(v.device) + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + _is_caual = 1 if is_causal else 0 + _qk_quant_gran = 3 if qk_quant_gran == "per_thread" else 2 + _return_lse = 1 if return_lse else 0 + + head_dim_og = q.size(-1) + + if head_dim_og < 64: + q = torch.nn.functional.pad(q, (0, 64 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 64 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 64 - head_dim_og)) + elif head_dim_og > 64 and head_dim_og < 128: + q = torch.nn.functional.pad(q, (0, 128 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 128 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 128 - head_dim_og)) + elif head_dim_og > 128: + raise ValueError(f"Unsupported head_dim: {head_dim_og}") + + # assert last dim is contiguous + assert q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1, ( + "Last dim of qkv must be contiguous." + ) + + if sm_scale is None: + sm_scale = head_dim_og**-0.5 + + seq_dim = 1 if _tensor_layout == 0 else 2 + nh_dim = 2 if _tensor_layout == 0 else 1 + + if smooth_k: + km = k.mean(dim=seq_dim, keepdim=True) + nqheads = q.size(2) + nkheads = k.size(2) + q_per_kv_heads = nqheads // nkheads + if q_per_kv_heads > 1: + # nheads_k => nheads_q + km_broadcast = torch.repeat_interleave(km, q_per_kv_heads, dim=nh_dim) + else: + km_broadcast = km + if return_lse: + if tensor_layout == "NHD": + lse_correction = ( + torch.matmul( + q.transpose(1, 2), km_broadcast.transpose(1, 2).transpose(2, 3) + ) + .squeeze(-1) + .to(torch.float32) + ) + else: + lse_correction = ( + torch.matmul(q, km_broadcast.transpose(2, 3)) + .squeeze(-1) + .to(torch.float32) + ) + else: + km = None + + if qk_quant_gran == "per_warp": + q_int8, q_scale, k_int8, k_scale = per_warp_int8_cuda( + q, k, km, tensor_layout=tensor_layout, BLKQ=64, WARPQ=16, BLKK=128 + ) + elif qk_quant_gran == "per_thread": + q_int8, q_scale, k_int8, k_scale = per_thread_int8_triton( + q, + k, + km, + tensor_layout=tensor_layout, + BLKQ=64, + WARPQ=16, + BLKK=128, + WARPK=128, + ) + + o = torch.empty(q.size(), dtype=dtype, device=q.device) + + # pad v to multiple of 128 + # TODO: modify per_channel_fp8 kernel to handle this + kv_len = k.size(seq_dim) + v_pad_len = 128 - (kv_len % 128) if kv_len % 128 != 0 else 0 + if v_pad_len > 0: + if tensor_layout == "HND": + v = torch.cat( + [ + v, + torch.zeros( + v.size(0), + v.size(1), + v_pad_len, + v.size(3), + dtype=v.dtype, + device=v.device, + ), + ], + dim=2, + ) + else: + v = torch.cat( + [ + v, + torch.zeros( + v.size(0), + v_pad_len, + v.size(2), + v.size(3), + dtype=v.dtype, + device=v.device, + ), + ], + dim=1, + ) + + v_fp8, v_scale, _ = per_channel_fp8(v, tensor_layout=tensor_layout, smooth_v=False) + + if pv_accum_dtype == "fp32": + raise NotImplementedError("Please use 
pv_accum_dtype='fp32+fp32' for sm90.") + lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn( + q_int8, + k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + elif pv_accum_dtype == "fp32+fp32": + print( + "qint8", + q_int8.shape, + "qscale", + q_scale.shape, + "kint8", + k_int8.shape, + "kscale", + k_scale.shape, + "vfp8", + v_fp8.shape, + "vscale", + v_scale.shape, + ) + lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf_sm90( + q_int8, + k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + + o = o[..., :head_dim_og] + + if return_lse: + return ( + o, + lse / 1.44269504 + lse_correction * sm_scale + if smooth_k + else lse / 1.44269504, + ) + else: + return o diff --git a/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/layers.py b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/quant.py b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/quant.py new file mode 100644 index 0000000000000000000000000000000000000000..2e7a32c6a1e502bee12d0e0564ff2b90f6b00462 --- /dev/null +++ b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/quant.py @@ -0,0 +1,326 @@ +""" +Copyright (c) 2024 by SageAttention team. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import torch +from typing import Optional + +from ._ops import ops + + +def per_block_int8( + q: torch.Tensor, + k: torch.Tensor, + km: Optional[torch.Tensor] = None, + BLKQ: int = 128, + BLKK: int = 64, + sm_scale: Optional[float] = None, + tensor_layout: str = "HND", +): + """ + Quantize the query tensor `q` and the key tensor `k` with per block quantization. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + km : Optional[torch.Tensor] + The mean tensor of `k` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]``. + Should be of the same dtype as `k` if provided. Default is None. + + sm_scale : Optional[float] + The scale factor for the softmax operation. Default is ``head_dim**-0.5``. + It will be multiplied by ``1.44269504`` to work together with the triton attention kernel. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + Returns + ------- + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor] + A tuple containing: + - The quantized query tensor. Shape: Same as `q` but with `int8` dtype. 
+ - The scale tensor of the query tensor. Shape: ``[batch_size, num_qo_heads, (qo_len + BLKQ - 1) // BLKQ]`` with `float32` dtype. + - The quantized key tensor. Shape: Same as `k` but with `int8` dtype. + - The scale tensor of the key tensor. Shape: ``[batch_size, num_kv_heads, (kv_len + BLKK - 1) // BLKK]`` with `float32` dtype. + + Note + ---- + - The tensors `q` and `k` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + """ + + q_int8 = torch.empty(q.shape, dtype=torch.int8, device=q.device) + k_int8 = torch.empty(k.shape, dtype=torch.int8, device=k.device) + + if tensor_layout == "HND": + b, h_qo, qo_len, head_dim = q.shape + _, h_kv, kv_len, _ = k.shape + + elif tensor_layout == "NHD": + b, qo_len, h_qo, head_dim = q.shape + _, kv_len, h_kv, _ = k.shape + + else: + raise ValueError(f"Unknown tensor layout: {tensor_layout}") + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + + q_scale = torch.empty( + (b, h_qo, (qo_len + BLKQ - 1) // BLKQ), device=q.device, dtype=torch.float32 + ) + k_scale = torch.empty( + (b, h_kv, (kv_len + BLKK - 1) // BLKK), device=q.device, dtype=torch.float32 + ) + + if sm_scale is None: + sm_scale = head_dim**-0.5 + + sm_scale *= 1.44269504 + + ops.quant_per_block_int8_cuda(q, q_int8, q_scale, sm_scale, BLKQ, _tensor_layout) + if km is not None: + km = km.squeeze(1) if _tensor_layout == 0 else km.squeeze(2) + ops.quant_per_block_int8_fuse_sub_mean_cuda( + k, km, k_int8, k_scale, BLKK, _tensor_layout + ) + else: + # The bound CUDA op expects an sm_scale argument; use 1.0 for K to avoid scaling + ops.quant_per_block_int8_cuda(k, k_int8, k_scale, 1.0, BLKK, _tensor_layout) + + return q_int8, q_scale, k_int8, k_scale + + +def per_warp_int8( + q: torch.Tensor, + k: torch.Tensor, + km: Optional[torch.Tensor] = None, + BLKQ: int = 128, + WARPQ: int = 32, + BLKK: int = 64, + tensor_layout: str = "HND", +): + """ + Quantize the query tensor `q` with per warp quantization and the key tensor `k` with per block quantization. + Warp size of quantizing `q` is 16 or 32, with a block size of 64 or 128. + Block size of quantizing `k` is 64 or 128. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + km : Optional[torch.Tensor] + The mean tensor of `k` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]``. + Should be of the same dtype as `k` if provided. Default is None. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + Returns + ------- + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor] + A tuple containing: + - The quantized query tensor. Shape: Same as `q` but with `int8` dtype. + - The scale tensor of the query tensor. Shape: ``[batch_size, num_qo_heads, (qo_len + BLKQ - 1) // BLKQ * (BLKQ // WARPQ)]`` with `float32` dtype. + - The quantized key tensor. Shape: Same as `k` but with `int8` dtype. + - The scale tensor of the key tensor. Shape: ``[batch_size, num_kv_heads, (kv_len + BLKK - 1) // BLKK]`` with `float32` dtype. 
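+
+    Example
+    -------
+    A minimal usage sketch; shapes are illustrative and the optional `km` argument is omitted::
+
+        >>> q = torch.randn(1, 8, 1024, 128, dtype=torch.float16, device="cuda")
+        >>> k = torch.randn(1, 8, 1024, 128, dtype=torch.float16, device="cuda")
+        >>> q_int8, q_scale, k_int8, k_scale = per_warp_int8(q, k, BLKQ=128, WARPQ=32, BLKK=64)
+        >>> q_scale.shape  # one scale per warp: ceil(1024 / 128) * (128 // 32) = 32
+        torch.Size([1, 8, 32])
+        >>> k_scale.shape  # one scale per key block: ceil(1024 / 64) = 16
+        torch.Size([1, 8, 16])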
+ + Note + ---- + - The tensors `q` and `k` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + """ + + q_int8 = torch.empty(q.shape, dtype=torch.int8, device=q.device) + k_int8 = torch.empty(k.shape, dtype=torch.int8, device=k.device) + + if tensor_layout == "HND": + b, h_qo, qo_len, head_dim = q.shape + _, h_kv, kv_len, _ = k.shape + + elif tensor_layout == "NHD": + b, qo_len, h_qo, head_dim = q.shape + _, kv_len, h_kv, _ = k.shape + + else: + raise ValueError(f"Unknown tensor layout: {tensor_layout}") + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + + q_scale = torch.empty( + (b, h_qo, ((qo_len + BLKQ - 1) // BLKQ) * (BLKQ // WARPQ)), + device=q.device, + dtype=torch.float32, + ) + k_scale = torch.empty( + (b, h_kv, (kv_len + BLKK - 1) // BLKK), device=q.device, dtype=torch.float32 + ) + + ops.quant_per_warp_int8_cuda(q, q_int8, q_scale, BLKQ, WARPQ, _tensor_layout) + + if km is not None: + km = km.squeeze(1) if _tensor_layout == 0 else km.squeeze(2) + ops.quant_per_block_int8_fuse_sub_mean_cuda( + k, km, k_int8, k_scale, BLKK, _tensor_layout + ) + else: + # The bound CUDA op expects an sm_scale argument; use 1.0 for K to avoid scaling + ops.quant_per_block_int8_cuda(k, k_int8, k_scale, 1.0, BLKK, _tensor_layout) + + return q_int8, q_scale, k_int8, k_scale + + +def sub_mean(v: torch.Tensor, tensor_layout: str = "HND"): + """ + Calculate the mean of the tensor `v` along the sequence length dimension and subtract it from `v`. Result is stored as fp16. + + Parameters + ---------- + v : torch.Tensor + The input tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + Returns + ------- + Tuple[torch.Tensor, torch.Tensor] + A tuple containing: + - The tensor `v_smoothed` with the mean subtracted and stored as fp16. Shape: Same as `v` with `float16` dtype. + - The mean tensor of `v` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]`` with dtype same as `v`. + + Note + ---- + - The tensors `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - The returned tensor `v_smoothed` will have dtype ``torch.float16`` regardless of the input dtype. + - The returned mean tensor will have the same dtype as the input tensor. + """ + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + vm = v.mean(dim=1 if _tensor_layout == 0 else 2) + + v_smoothed = torch.empty(v.shape, dtype=torch.float16, device=v.device) + + # subtract mean and store the result as fp16 + ops.sub_mean_cuda(v, vm, v_smoothed, _tensor_layout) + + return v_smoothed, vm + + +def per_channel_fp8( + v: torch.Tensor, + tensor_layout: str = "HND", + scale_max: float = 448.0, + smooth_v: bool = True, +): + """ + Transpose, pad and permute the tensor `v` and quantize it to fp8 with per channel quantization. + `v` is first transposed along the head dimension and the sequence length dimension, then padded to a multiple of 64. + After that, the tensor is permuted along the sequence length dimension by ``[0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15]``. + The quantization is done per channel, with the scale value and smooth factor calculated per channel. + + Parameters + ---------- + v : torch.Tensor + The input tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. 
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + scale_max : float + The maximum scale value for the quantization. Default is 448.0 (upper bound of E4M3 data format). + + smooth_v : bool + Whether to smooth the quantized tensor. Default is True. + + Returns + ------- + Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] + A tuple containing: + - The quantized tensor `v_fp8`. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, head_dim, (kv_len + 63) // 64 * 64]``, with `float8_e4m3fn` dtype. + - If `tensor_layout` is "NHD": ``[batch_size, head_dim, num_kv_heads, (kv_len + 63) // 64 * 64]``, with `float8_e4m3fn` dtype. + - The scale tensor of `v`. Shape: ``[batch_size, num_kv_heads, head_dim]`` with `float32` dtype. + - The mean tensor of `v` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]`` with `float32` dtype. + + Note + ---- + - The tensors `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - The returned mean tensor will be None if `smooth_v` is False. Otherwise it will have dtype ``torch.float32``. + """ + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + + if tensor_layout == "HND": + b, h_kv, kv_len, head_dim = v.shape + padded_len = (kv_len + 63) // 64 * 64 + v_transposed_permutted = torch.empty( + (b, h_kv, head_dim, padded_len), dtype=v.dtype, device=v.device + ) + + elif tensor_layout == "NHD": + b, kv_len, h_kv, head_dim = v.shape + padded_len = (kv_len + 63) // 64 * 64 + v_transposed_permutted = torch.empty( + (b, head_dim, h_kv, padded_len), dtype=v.dtype, device=v.device + ) + + ops.transpose_pad_permute_cuda(v, v_transposed_permutted, _tensor_layout) + + v_fp8 = torch.empty( + v_transposed_permutted.shape, dtype=torch.float8_e4m3fn, device=v.device + ) + + v_scale = torch.empty((b, h_kv, head_dim), dtype=torch.float32, device=v.device) + vm = torch.empty((b, h_kv, head_dim), dtype=torch.float32, device=v.device) + + if smooth_v: + ops.mean_scale_fuse_quant_cuda( + v_transposed_permutted, + v_fp8, + vm, + v_scale, + kv_len, + scale_max, + _tensor_layout, + ) + return v_fp8, v_scale, vm + else: + ops.scale_fuse_quant_cuda( + v_transposed_permutted, v_fp8, v_scale, kv_len, scale_max, _tensor_layout + ) + return v_fp8, v_scale, None diff --git a/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/quant_per_thread.py b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/quant_per_thread.py new file mode 100644 index 0000000000000000000000000000000000000000..ab81f57c3e6dd9df89946ba54c4e2c3844c94d34 --- /dev/null +++ b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/quant_per_thread.py @@ -0,0 +1,204 @@ +""" +Copyright (c) 2024 by SageAttention team. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import torch +import triton +import triton.language as tl + +@triton.jit +def quant_query_per_thread_int8_kernel(Input, Output, Scale, L, + stride_iz, stride_ih, stride_in, + stride_oz, stride_oh, stride_on, + stride_sz, stride_sh, + C: tl.constexpr, BLK: tl.constexpr): + off_blk = tl.program_id(0) // 8 + off_tld = tl.program_id(0) % 8 + off_h = tl.program_id(1) + off_b = tl.program_id(2) + + offs_n = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld + offs_k = tl.arange(0, C) + + input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :] + output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :] + scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 8 + off_tld + + x = tl.load(input_ptrs, mask=offs_n[:, None] < L) + x = x.to(tl.float32) + scale = tl.max(tl.abs(x)) / 127. + 0.0000001 + x_int8 = x / scale + x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1) + x_int8 = x_int8.to(tl.int8) + tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L) + tl.store(scale_ptrs, scale) + +@triton.jit +def quant_key_per_thread_int8_kernel(Input, Output, Scale, L, + stride_iz, stride_ih, stride_in, + stride_oz, stride_oh, stride_on, + stride_sz, stride_sh, + C: tl.constexpr, BLK: tl.constexpr): + off_blk = tl.program_id(0) // 4 + off_tld = tl.program_id(0) % 4 + off_h = tl.program_id(1) + off_b = tl.program_id(2) + + # offs_n = off_blk * BLK + tl.cat(tl.arange(0, BLK // 8) * 8, tl.arange(0, BLK // 8) * 8 + 1, True) + off_tld * 2 + # offs_k = tl.arange(0, C) + + # input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :] + # output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :] + # scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 4 + off_tld + + # x = tl.load(input_ptrs, mask=offs_n[:, None] < L) + # x = x.to(tl.float32) + # scale = tl.max(tl.abs(x)) / 127. + 0.0000001 + # x_int8 = x / scale + # x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1) + # x_int8 = x_int8.to(tl.int8) + # tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L) + # tl.store(scale_ptrs, scale) + + offs_n0 = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld * 2 + offs_n1 = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld * 2 + 1 + offs_k = tl.arange(0, C) + + input_ptrs0 = Input + off_b * stride_iz + off_h * stride_ih + offs_n0[:, None] * stride_in + offs_k[None, :] + input_ptrs1 = Input + off_b * stride_iz + off_h * stride_ih + offs_n1[:, None] * stride_in + offs_k[None, :] + output_ptrs0 = Output + off_b * stride_oz + off_h * stride_oh + offs_n0[:, None] * stride_on + offs_k[None, :] + output_ptrs1 = Output + off_b * stride_oz + off_h * stride_oh + offs_n1[:, None] * stride_on + offs_k[None, :] + scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 4 + off_tld + + x0 = tl.load(input_ptrs0, mask=offs_n0[:, None] < L) + x1 = tl.load(input_ptrs1, mask=offs_n1[:, None] < L) + x0 = x0.to(tl.float32) + x1 = x1.to(tl.float32) + scale = max(tl.max(tl.abs(x0)), tl.max(tl.abs(x1))) / 127. 
+ 0.0000001 + x0_int8 = x0 / scale + x1_int8 = x1 / scale + x0_int8 += 0.5 * tl.where(x0_int8 >= 0, 1, -1) + x1_int8 += 0.5 * tl.where(x1_int8 >= 0, 1, -1) + x0_int8 = x0_int8.to(tl.int8) + x1_int8 = x1_int8.to(tl.int8) + tl.store(output_ptrs0, x0_int8, mask=offs_n0[:, None] < L) + tl.store(output_ptrs1, x1_int8, mask=offs_n1[:, None] < L) + tl.store(scale_ptrs, scale) + +@triton.jit +def quant_query_per_thread_int4_kernel(Input, Output, Scale, L, + stride_iz, stride_ih, stride_in, + stride_oz, stride_oh, stride_on, + stride_sz, stride_sh, + C: tl.constexpr, BLK: tl.constexpr): + off_blk = tl.program_id(0) // 8 + off_tld = tl.program_id(0) % 8 + off_h = tl.program_id(1) + off_b = tl.program_id(2) + + offs_n = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld + offs_k = tl.arange(0, C) + + input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :] + output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :] + scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 8 + off_tld + + x = tl.load(input_ptrs, mask=offs_n[:, None] < L) + x = x.to(tl.float32) + scale = tl.max(tl.abs(x)) / 7. + 0.0000001 + x_int8 = x / scale + x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1) + x_int8 = x_int8.to(tl.int8) + tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L) + tl.store(scale_ptrs, scale) + +@triton.jit +def quant_key_per_thread_int4_kernel(Input, Output, Scale, L, + stride_iz, stride_ih, stride_in, + stride_oz, stride_oh, stride_on, + stride_sz, stride_sh, + C: tl.constexpr, BLK: tl.constexpr): + off_blk = tl.program_id(0) // 4 + off_tld = tl.program_id(0) % 4 + off_h = tl.program_id(1) + off_b = tl.program_id(2) + + offs_n = off_blk * BLK + tl.cat(tl.arange(0, BLK // 8) * 8, tl.arange(0, BLK // 8) * 8 + 1, True) + off_tld * 2 + offs_k = tl.arange(0, C) + + input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :] + output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :] + scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 4 + off_tld + + x = tl.load(input_ptrs, mask=offs_n[:, None] < L) + x = x.to(tl.float32) + scale = tl.max(tl.abs(x)) / 7. 
+ 0.0000001 + x_int8 = x / scale + x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1) + x_int8 = x_int8.to(tl.int8) + tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L) + tl.store(scale_ptrs, scale) + +def per_thread_int8(q, k, km=None, BLKQ=128, WARPQ=32, BLKK=64, WARPK=64, sm_scale=None, tensor_layout="HND"): + q_int8 = torch.empty(q.shape, dtype=torch.int8, device=q.device) + k_int8 = torch.empty(k.shape, dtype=torch.int8, device=k.device) + + if km is not None: + k = k - km + + if tensor_layout == "HND": + b, h_qo, qo_len, head_dim = q.shape + _, h_kv, kv_len, _ = k.shape + + stride_bz_q, stride_h_q, stride_seq_q = q.stride(0), q.stride(1), q.stride(2) + stride_bz_qo, stride_h_qo, stride_seq_qo = q_int8.stride(0), q_int8.stride(1), q_int8.stride(2) + stride_bz_k, stride_h_k, stride_seq_k = k.stride(0), k.stride(1), k.stride(2) + stride_bz_ko, stride_h_ko, stride_seq_ko = k_int8.stride(0), k_int8.stride(1), k_int8.stride(2) + elif tensor_layout == "NHD": + b, qo_len, h_qo, head_dim = q.shape + _, kv_len, h_kv, _ = k.shape + + stride_bz_q, stride_h_q, stride_seq_q = q.stride(0), q.stride(2), q.stride(1) + stride_bz_qo, stride_h_qo, stride_seq_qo = q_int8.stride(0), q_int8.stride(2), q_int8.stride(1) + stride_bz_k, stride_h_k, stride_seq_k = k.stride(0), k.stride(2), k.stride(1) + stride_bz_ko, stride_h_ko, stride_seq_ko = k_int8.stride(0), k_int8.stride(2), k_int8.stride(1) + else: + raise ValueError(f"Unknown tensor layout: {tensor_layout}") + + q_scale = torch.empty((b, h_qo, (qo_len + BLKQ - 1) // BLKQ * (BLKQ // WARPQ) * 8), device=q.device, dtype=torch.float32) + k_scale = torch.empty((b, h_kv, (kv_len + BLKK - 1) // BLKK * (BLKK // WARPK) * 4), device=q.device, dtype=torch.float32) + + if sm_scale is None: + sm_scale = head_dim**-0.5 + + grid = ((qo_len + BLKQ - 1) // BLKQ * (BLKQ // WARPQ) * 8, h_qo, b) + quant_query_per_thread_int8_kernel[grid]( + q, q_int8, q_scale, qo_len, + stride_bz_q, stride_h_q, stride_seq_q, + stride_bz_qo, stride_h_qo, stride_seq_qo, + q_scale.stride(0), q_scale.stride(1), + C=head_dim, BLK=WARPQ + ) + + grid = ((kv_len + BLKK - 1) // BLKK * (BLKK // WARPK) * 4, h_kv, b) + quant_key_per_thread_int8_kernel[grid]( + k, k_int8, k_scale, kv_len, + stride_bz_k, stride_h_k, stride_seq_k, + stride_bz_ko, stride_h_ko, stride_seq_ko, + k_scale.stride(0), k_scale.stride(1), + C=head_dim, BLK=WARPK + ) + + return q_int8, q_scale, k_int8, k_scale \ No newline at end of file diff --git a/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__init__.py b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5df69e92664edfbd0820d5126792e41e23a72762 --- /dev/null +++ b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__init__.py @@ -0,0 +1,12 @@ +from .quant import per_block_int8, per_warp_int8, sub_mean, per_channel_fp8 +from .core import sageattn, sageattn_qk_int8_pv_fp8_cuda + + +__all__ = [ + "per_block_int8", + "per_warp_int8", + "sub_mean", + "per_channel_fp8", + "sageattn", + "sageattn_qk_int8_pv_fp8_cuda", +] \ No newline at end of file diff --git a/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4353512b2dfee72b0c59c3c743b89d1a6d21d53c Binary files /dev/null and b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc differ diff --git 
a/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..641efed3dee386c39c00ddfa4211d067d486a7ee Binary files /dev/null and b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1db66c5c7ef07688c5a4552f4b3d587c87edbcc5 Binary files /dev/null and b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ee3c8da8c38b79d4bad4db9d7a0389bf16df47e3 Binary files /dev/null and b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fdee849699e5bd19bfe27146e0a151b62d1fb069 Binary files /dev/null and b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/_ops.py b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..7dc56aab172e24e052a9f0c78060ecfbdff00309 --- /dev/null +++ b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _sage_attention_44b112f_dirty +ops = torch.ops._sage_attention_44b112f_dirty + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_sage_attention_44b112f_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/_sage_attention_44b112f_dirty.abi3.so b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/_sage_attention_44b112f_dirty.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..84d12f5fd0187b990f9764433cd7efce32de6cf0 --- /dev/null +++ b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/_sage_attention_44b112f_dirty.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28e181de0c6388653fb4b8b2d7347f1f547fc84fe7dc45bc66db9b1431d141bc +size 26037392 diff --git a/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/core.py b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/core.py new file mode 100644 index 0000000000000000000000000000000000000000..dc44a8e1ee17a5c5c65da5adda6faf9228cca55e --- /dev/null +++ b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/core.py @@ -0,0 +1,983 @@ +""" +Copyright (c) 2024 by SageAttention team. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import torch +import torch.nn.functional as F + +from ._ops import ops + + +from .quant import per_warp_int8 as per_warp_int8_cuda +from .quant import sub_mean +from .quant import per_channel_fp8 +from .quant_per_thread import per_thread_int8 as per_thread_int8_triton + +from typing import Any, List, Literal, Optional, Tuple, Union +import warnings + + +import subprocess +import re + + +def get_cuda_version(): + try: + output = subprocess.check_output(["nvcc", "--version"]).decode() + match = re.search(r"release (\d+)\.(\d+)", output) + if match: + major, minor = int(match.group(1)), int(match.group(2)) + return major, minor + except Exception as e: + print("Failed to get CUDA version:", e) + return None, None + + +def get_cuda_arch_versions(): + cuda_archs = [] + for i in range(torch.cuda.device_count()): + major, minor = torch.cuda.get_device_capability(i) + cuda_archs.append(f"sm{major}{minor}") + return cuda_archs + + +def sageattn( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + tensor_layout: str = "HND", + is_causal: bool = False, + sm_scale: Optional[float] = None, + return_lse: bool = False, + **kwargs: Any, +): + """ + Automatically selects the appropriate implementation of the SageAttention kernel based on the GPU compute capability. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + v : torch.Tensor + The value tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + is_causal : bool + Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len. + Default: False. + + sm_scale : Optional[float] + The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``. + + return_lse : bool + Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention. + Default: False. + + Returns + ------- + torch.Tensor + The output tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + torch.Tensor + The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor). + Shape: ``[batch_size, num_qo_heads, qo_len]``. + Only returned if `return_lse` is True. + + Note + ---- + - ``num_qo_heads`` must be divisible by ``num_kv_heads``. + - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - All tensors must be on the same cuda device. 
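+
+    Example
+    -------
+    A minimal usage sketch (shapes are illustrative; assumes fp16 inputs on a CUDA device)::
+
+        q = torch.randn(1, 8, 1024, 128, dtype=torch.float16, device="cuda")
+        k = torch.randn(1, 8, 1024, 128, dtype=torch.float16, device="cuda")
+        v = torch.randn(1, 8, 1024, 128, dtype=torch.float16, device="cuda")
+        o = sageattn(q, k, v, tensor_layout="HND", is_causal=True)
+        # o has shape [1, 8, 1024, 128], same dtype and layout as q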
+ """ + + arch = get_cuda_arch_versions()[q.device.index] + if arch == "sm80": + return sageattn_qk_int8_pv_fp16_cuda( + q, + k, + v, + tensor_layout=tensor_layout, + is_causal=is_causal, + sm_scale=sm_scale, + return_lse=return_lse, + pv_accum_dtype="fp32", + ) + elif arch == "sm89": + return sageattn_qk_int8_pv_fp8_cuda( + q, + k, + v, + tensor_layout=tensor_layout, + is_causal=is_causal, + sm_scale=sm_scale, + return_lse=return_lse, + pv_accum_dtype="fp32+fp16", + ) + elif arch == "sm90": + return sageattn_qk_int8_pv_fp8_cuda_sm90( + q, + k, + v, + tensor_layout=tensor_layout, + is_causal=is_causal, + sm_scale=sm_scale, + return_lse=return_lse, + pv_accum_dtype="fp32+fp32", + ) + elif arch == "sm120": + return sageattn_qk_int8_pv_fp8_cuda( + q, + k, + v, + tensor_layout=tensor_layout, + is_causal=is_causal, + qk_quant_gran="per_warp", + sm_scale=sm_scale, + return_lse=return_lse, + pv_accum_dtype="fp32+fp16", + ) # sm120 has accurate fp32 accumulator for fp8 mma and triton kernel is currently not usable on sm120. + else: + raise ValueError(f"Unsupported CUDA architecture: {arch}") + + +@torch.compiler.disable +def sageattn_qk_int8_pv_fp16_cuda( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + tensor_layout: str = "HND", + is_causal: bool = False, + qk_quant_gran: str = "per_thread", + sm_scale: Optional[float] = None, + pv_accum_dtype: str = "fp32", + smooth_k: bool = True, + smooth_v: bool = False, + return_lse: bool = False, + **kwargs: Any, +) -> torch.Tensor: + """ + SageAttention with INT8 quantization for Q and K, FP16 PV with FP16/FP32 accumulation, implemented using CUDA. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + v : torch.Tensor + The value tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + is_causal : bool + Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len. + Default: False. + + qk_quant_gran : str + The granularity of quantization for Q and K, either "per_warp" or "per_thread". + Default: "per_thread". + + sm_scale : Optional[float] + The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``. + + pv_accum_dtype : str + The dtype of the accumulation of the product of the value tensor and the attention weights, either "fp16", "fp16+fp32" or "fp32". + - "fp16": PV accumulation is done in fully in FP16. This is the fastest option but may lead to numerical instability. `smooth_v` option will increase the accuracy in cases when the value tensor has a large bias (like in CogVideoX-2b). + - "fp32": PV accumulation is done in FP32. This is the most accurate option but may be slower than "fp16" due to CUDA core overhead. + - "fp16+fp32": PV accumulation is done in FP16, but added to a FP32 buffer every few iterations. This offers a balance between speed and accuracy. + Default: "fp32". 
+ + smooth_k : bool + Whether to smooth the key tensor by subtracting the mean along the sequence dimension. + Default: True. + + smooth_v : bool + Whether to smooth the value tensor by subtracting the mean along the sequence dimension. + smooth_v will be ignored if pv_accum_dtype is "fp32" or "fp16+fp32". + Default: False. + + return_lse : bool + Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention. + Default: False. + + Returns + ------- + torch.Tensor + The output tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + torch.Tensor + The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor). + Shape: ``[batch_size, num_qo_heads, qo_len]``. + Only returned if `return_lse` is True. + + Note + ---- + - ``num_qo_heads`` must be divisible by ``num_kv_heads``. + - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - All tensors must be on the same cuda device. + - `smooth_k` will introduce slight overhead but will improve the accuracy under most circumstances. + """ + + dtype = q.dtype + assert q.is_cuda, "Input tensors must be on cuda." + assert dtype in [torch.float16, torch.bfloat16], ( + "Input tensors must be in dtype of torch.float16 or torch.bfloat16" + ) + assert qk_quant_gran in ["per_warp", "per_thread"], ( + "qk_quant_gran must be either 'per_warp' or 'per_thread'." + ) + assert q.device == k.device == v.device, "All tensors must be on the same device." + assert q.dtype == k.dtype == v.dtype, "All tensors must have the same dtype." + + # FIXME(DefTruth): make sage attention work compatible with distributed + # env, for example, xDiT which launch by torchrun. Without this workaround, + # sage attention will run into illegal memory access error after first + # inference step in distributed env for multi gpus inference. This small + # workaround also make sage attention work compatible with torch.compile + # through non-fullgraph compile mode. + torch.cuda.set_device(v.device) + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + _is_caual = 1 if is_causal else 0 + _qk_quant_gran = 3 if qk_quant_gran == "per_thread" else 2 + _return_lse = 1 if return_lse else 0 + + head_dim_og = q.size(-1) + + if head_dim_og < 64: + q = torch.nn.functional.pad(q, (0, 64 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 64 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 64 - head_dim_og)) + elif head_dim_og > 64 and head_dim_og < 128: + q = torch.nn.functional.pad(q, (0, 128 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 128 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 128 - head_dim_og)) + elif head_dim_og > 128: + raise ValueError(f"Unsupported head_dim: {head_dim_og}") + + # assert last dim is contiguous + assert q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1, ( + "Last dim of qkv must be contiguous." 
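+        # Note: only the last (head_dim) dimension needs to be contiguous; the leading
+        # batch/head/sequence dimensions are handled via the tensor_layout flag and
+        # explicit strides, so sliced or transposed inputs are fine as long as the
+        # head_dim stride is 1.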
+ ) + + if sm_scale is None: + sm_scale = head_dim_og**-0.5 + + seq_dim = 1 if _tensor_layout == 0 else 2 + nh_dim = 2 if _tensor_layout == 0 else 1 + + if smooth_k: + km = k.mean(dim=seq_dim, keepdim=True) + nqheads = q.size(2) + nkheads = k.size(2) + q_per_kv_heads = nqheads // nkheads + if q_per_kv_heads > 1: + # nheads_k => nheads_q + km_broadcast = torch.repeat_interleave(km, q_per_kv_heads, dim=nh_dim) + else: + km_broadcast = km + if return_lse: + if tensor_layout == "NHD": + lse_correction = ( + torch.matmul( + q.transpose(1, 2), km_broadcast.transpose(1, 2).transpose(2, 3) + ) + .squeeze(-1) + .to(torch.float32) + ) + else: + lse_correction = ( + torch.matmul(q, km_broadcast.transpose(2, 3)) + .squeeze(-1) + .to(torch.float32) + ) + else: + km = None + + if qk_quant_gran == "per_warp": + q_int8, q_scale, k_int8, k_scale = per_warp_int8_cuda( + q, + k, + km, + tensor_layout=tensor_layout, + BLKQ=128, + WARPQ=(16 if (q.size(-1) == 128 and pv_accum_dtype == "fp16+fp32") else 32), + BLKK=64, + ) + elif qk_quant_gran == "per_thread": + q_int8, q_scale, k_int8, k_scale = per_thread_int8_triton( + q, + k, + km, + tensor_layout=tensor_layout, + BLKQ=128, + WARPQ=(16 if (q.size(-1) == 128 and pv_accum_dtype == "fp16+fp32") else 32), + BLKK=64, + WARPK=64, + ) + + o = torch.empty(q.size(), dtype=dtype, device=q.device) + + if pv_accum_dtype in ["fp32", "fp16+fp32"] and smooth_v: + warnings.warn(f"pv_accum_dtype is {pv_accum_dtype}, smooth_v will be ignored.") + smooth_v = False + + if pv_accum_dtype == "fp32": + v = v.to(torch.float16) + lse = _qattn_sm80.qk_int8_sv_f16_accum_f32_attn( + q_int8, + k_int8, + v, + o, + q_scale, + k_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + elif pv_accum_dtype == "fp16": + if smooth_v: + smoothed_v, vm = sub_mean(v, tensor_layout=tensor_layout) + lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_fuse_v_mean_attn( + q_int8, + k_int8, + smoothed_v, + o, + q_scale, + k_scale, + vm, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + else: + v = v.to(torch.float16) + lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_attn( + q_int8, + k_int8, + v, + o, + q_scale, + k_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + elif pv_accum_dtype == "fp16+fp32": + v = v.to(torch.float16) + lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_attn_inst_buf( + q_int8, + k_int8, + v, + o, + q_scale, + k_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + else: + raise ValueError(f"Unsupported pv_accum_dtype: {pv_accum_dtype}") + + o = o[..., :head_dim_og] + + if return_lse: + return ( + o, + lse / 1.44269504 + lse_correction * sm_scale + if smooth_k + else lse / 1.44269504, + ) + else: + return o + + +@torch.compiler.disable +def sageattn_qk_int8_pv_fp8_cuda( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + tensor_layout: str = "HND", + is_causal: bool = False, + qk_quant_gran: str = "per_thread", + sm_scale: Optional[float] = None, + pv_accum_dtype: str = "fp32+fp16", + smooth_k: bool = True, + smooth_v: bool = False, + return_lse: bool = False, + **kwargs: Any, +) -> torch.Tensor: + """ + SageAttention with INT8 quantization for Q and K, FP8 PV with FP32 accumulation, implemented using CUDA. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. 
+ + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + v : torch.Tensor + The value tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + is_causal : bool + Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len. + Default: False. + + qk_quant_gran : str + The granularity of quantization for Q and K, either "per_warp" or "per_thread". + Default: "per_thread". + + sm_scale : Optional[float] + The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``. + + pv_accum_dtype : str + The dtype of the accumulation of the product of the value tensor and the attention weights, either "fp32" or "fp32+fp32". + - "fp32": PV accumulation is done in fully in FP32. However, due to the hardware issue, there are only 22 valid bits in the FP32 accumulator. + - "fp32+fp32": PV accumulation is done in FP32 (actually FP22), but added to a FP32 buffer every few iterations. This offers a balance between speed and accuracy. + Default: "fp32+fp32". + + smooth_k : bool + Whether to smooth the key tensor by subtracting the mean along the sequence dimension. + Default: True. + + smooth_v : bool + Whether to smooth the value tensor by subtracting the mean along the sequence dimension. + smooth_v will be ignored if pv_accum_dtype is "fp32+fp32". + Default: False. + + return_lse : bool + Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention. + Default: False. + + Returns + ------- + torch.Tensor + The output tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + torch.Tensor + The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor). + Shape: ``[batch_size, num_qo_heads, qo_len]``. + Only returned if `return_lse` is True. + + Note + ---- + - ``num_qo_heads`` must be divisible by ``num_kv_heads``. + - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - All tensors must be on the same cuda device. + - `smooth_k` will introduce slight overhead but will improve the accuracy under most circumstances. + """ + + dtype = q.dtype + assert q.is_cuda, "Input tensors must be on cuda." + assert dtype in [torch.float16, torch.bfloat16], ( + "Input tensors must be in dtype of torch.float16 or torch.bfloat16" + ) + assert qk_quant_gran in ["per_warp", "per_thread"], ( + "qk_quant_gran must be either 'per_warp' or 'per_thread'." + ) + assert q.device == k.device == v.device, "All tensors must be on the same device." + assert q.dtype == k.dtype == v.dtype, "All tensors must have the same dtype." + + # cuda_major_version, cuda_minor_version = get_cuda_version() + # if(cuda_major_version, cuda_minor_version) < (12, 8) and pv_accum_dtype == 'fp32+fp16': + # warnings.warn("cuda version < 12.8, change pv_accum_dtype to 'fp32+fp32'") + # pv_accum_dtype = 'fp32+fp32' + + # FIXME(DefTruth): make sage attention work compatible with distributed + # env, for example, xDiT which launch by torchrun. 
Without this workaround, + # sage attention will run into illegal memory access error after first + # inference step in distributed env for multi gpus inference. This small + # workaround also make sage attention work compatible with torch.compile + # through non-fullgraph compile mode. + torch.cuda.set_device(v.device) + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + _is_caual = 1 if is_causal else 0 + _qk_quant_gran = 3 if qk_quant_gran == "per_thread" else 2 + _return_lse = 1 if return_lse else 0 + + head_dim_og = q.size(-1) + + if head_dim_og < 64: + q = torch.nn.functional.pad(q, (0, 64 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 64 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 64 - head_dim_og)) + elif head_dim_og > 64 and head_dim_og < 128: + q = torch.nn.functional.pad(q, (0, 128 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 128 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 128 - head_dim_og)) + elif head_dim_og > 128: + raise ValueError(f"Unsupported head_dim: {head_dim_og}") + + # assert last dim is contiguous + assert q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1, ( + "Last dim of qkv must be contiguous." + ) + + if sm_scale is None: + sm_scale = head_dim_og**-0.5 + + seq_dim = 1 if _tensor_layout == 0 else 2 + nh_dim = 2 if _tensor_layout == 0 else 1 + + if smooth_k: + km = k.mean(dim=seq_dim, keepdim=True) + nqheads = q.size(2) + nkheads = k.size(2) + q_per_kv_heads = nqheads // nkheads + if q_per_kv_heads > 1: + # nheads_k => nheads_q + km_broadcast = torch.repeat_interleave(km, q_per_kv_heads, dim=nh_dim) + else: + km_broadcast = km + if return_lse: + if tensor_layout == "NHD": + lse_correction = ( + torch.matmul( + q.transpose(1, 2), km_broadcast.transpose(1, 2).transpose(2, 3) + ) + .squeeze(-1) + .to(torch.float32) + ) + else: + lse_correction = ( + torch.matmul(q, km_broadcast.transpose(2, 3)) + .squeeze(-1) + .to(torch.float32) + ) + else: + km = None + + if qk_quant_gran == "per_warp": + q_int8, q_scale, k_int8, k_scale = per_warp_int8_cuda( + q, k, km, tensor_layout=tensor_layout, BLKQ=128, WARPQ=32, BLKK=64 + ) + elif qk_quant_gran == "per_thread": + q_int8, q_scale, k_int8, k_scale = per_thread_int8_triton( + q, k, km, tensor_layout=tensor_layout, BLKQ=128, WARPQ=32, BLKK=64, WARPK=64 + ) + + o = torch.empty(q.size(), dtype=dtype, device=q.device) + + if pv_accum_dtype == "fp32+fp32" and smooth_v: + warnings.warn("pv_accum_dtype is 'fp32+fp32', smooth_v will be ignored.") + smooth_v = False + + if pv_accum_dtype == "fp32+fp16" and smooth_v: + warnings.warn("pv_accum_dtype is 'fp32+fp16', smooth_v will be ignored.") + smooth_v = False + + quant_v_scale_max = 448.0 + if pv_accum_dtype == "fp32+fp16": + quant_v_scale_max = 2.25 + + v_fp8, v_scale, vm = per_channel_fp8( + v, tensor_layout=tensor_layout, scale_max=quant_v_scale_max, smooth_v=smooth_v + ) + print("before kernel call") + if pv_accum_dtype == "fp32": + if smooth_v: + lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_fuse_v_mean_attn( + q_int8, + k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + vm, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + torch.cuda.synchronize() + else: + lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn( + q_int8, + k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + torch.cuda.synchronize() + elif pv_accum_dtype == "fp32+fp32": + lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf( + q_int8, + 
k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + torch.cuda.synchronize() + elif pv_accum_dtype == "fp32+fp16": + lse = ops.qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf( + q_int8, + k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + torch.cuda.synchronize() + o = o[..., :head_dim_og] + print("after kernel call") + if return_lse: + return ( + o, + lse / 1.44269504 + lse_correction * sm_scale + if smooth_k + else lse / 1.44269504, + ) + else: + return o + + +@torch.compiler.disable +def sageattn_qk_int8_pv_fp8_cuda_sm90( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + tensor_layout: str = "HND", + is_causal: bool = False, + qk_quant_gran: str = "per_thread", + sm_scale: Optional[float] = None, + pv_accum_dtype: str = "fp32+fp32", + smooth_k: bool = True, + return_lse: bool = False, + **kwargs: Any, +) -> torch.Tensor: + """ + SageAttention with INT8 quantization for Q and K, FP8 PV with FP32 accumulation, implemented using CUDA. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + v : torch.Tensor + The value tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + is_causal : bool + Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len. + Default: False. + + qk_quant_gran : str + The granularity of quantization for Q and K, either "per_warp" or "per_thread". + Default: "per_thread". + + sm_scale : Optional[float] + The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``. + + pv_accum_dtype : str + The dtype of the accumulation of the product of the value tensor and the attention weights, either "fp32" or "fp32+fp32". + - "fp32": PV accumulation is done in fully in FP32. However, due to the hardware issue, there are only 22 valid bits in the FP32 accumulator. + - "fp32+fp32": PV accumulation is done in FP32 (actually FP22), but added to a FP32 buffer every few iterations. This offers a balance between speed and accuracy. + Default: "fp32+fp32". + + smooth_k : bool + Whether to smooth the key tensor by subtracting the mean along the sequence dimension. + Default: True. + + return_lse : bool + Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention. + Default: False. + + Returns + ------- + torch.Tensor + The output tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + torch.Tensor + The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor). + Shape: ``[batch_size, num_qo_heads, qo_len]``. + Only returned if `return_lse` is True. + + Note + ---- + - ``num_qo_heads`` must be divisible by ``num_kv_heads``. 
+ - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - All tensors must be on the same cuda device. + - `smooth_k` will introduce slight overhead but will improve the accuracy under most circumstances. + """ + + dtype = q.dtype + assert q.is_cuda, "Input tensors must be on cuda." + assert dtype in [torch.float16, torch.bfloat16], ( + "Input tensors must be in dtype of torch.float16 or torch.bfloat16" + ) + assert qk_quant_gran in ["per_warp", "per_thread"], ( + "qk_quant_gran must be either 'per_warp' or 'per_thread'." + ) + assert q.device == k.device == v.device, "All tensors must be on the same device." + assert q.dtype == k.dtype == v.dtype, "All tensors must have the same dtype." + + torch.cuda.set_device(v.device) + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + _is_caual = 1 if is_causal else 0 + _qk_quant_gran = 3 if qk_quant_gran == "per_thread" else 2 + _return_lse = 1 if return_lse else 0 + + head_dim_og = q.size(-1) + + if head_dim_og < 64: + q = torch.nn.functional.pad(q, (0, 64 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 64 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 64 - head_dim_og)) + elif head_dim_og > 64 and head_dim_og < 128: + q = torch.nn.functional.pad(q, (0, 128 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 128 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 128 - head_dim_og)) + elif head_dim_og > 128: + raise ValueError(f"Unsupported head_dim: {head_dim_og}") + + # assert last dim is contiguous + assert q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1, ( + "Last dim of qkv must be contiguous." + ) + + if sm_scale is None: + sm_scale = head_dim_og**-0.5 + + seq_dim = 1 if _tensor_layout == 0 else 2 + nh_dim = 2 if _tensor_layout == 0 else 1 + + if smooth_k: + km = k.mean(dim=seq_dim, keepdim=True) + nqheads = q.size(2) + nkheads = k.size(2) + q_per_kv_heads = nqheads // nkheads + if q_per_kv_heads > 1: + # nheads_k => nheads_q + km_broadcast = torch.repeat_interleave(km, q_per_kv_heads, dim=nh_dim) + else: + km_broadcast = km + if return_lse: + if tensor_layout == "NHD": + lse_correction = ( + torch.matmul( + q.transpose(1, 2), km_broadcast.transpose(1, 2).transpose(2, 3) + ) + .squeeze(-1) + .to(torch.float32) + ) + else: + lse_correction = ( + torch.matmul(q, km_broadcast.transpose(2, 3)) + .squeeze(-1) + .to(torch.float32) + ) + else: + km = None + + if qk_quant_gran == "per_warp": + q_int8, q_scale, k_int8, k_scale = per_warp_int8_cuda( + q, k, km, tensor_layout=tensor_layout, BLKQ=64, WARPQ=16, BLKK=128 + ) + elif qk_quant_gran == "per_thread": + q_int8, q_scale, k_int8, k_scale = per_thread_int8_triton( + q, + k, + km, + tensor_layout=tensor_layout, + BLKQ=64, + WARPQ=16, + BLKK=128, + WARPK=128, + ) + + o = torch.empty(q.size(), dtype=dtype, device=q.device) + + # pad v to multiple of 128 + # TODO: modify per_channel_fp8 kernel to handle this + kv_len = k.size(seq_dim) + v_pad_len = 128 - (kv_len % 128) if kv_len % 128 != 0 else 0 + if v_pad_len > 0: + if tensor_layout == "HND": + v = torch.cat( + [ + v, + torch.zeros( + v.size(0), + v.size(1), + v_pad_len, + v.size(3), + dtype=v.dtype, + device=v.device, + ), + ], + dim=2, + ) + else: + v = torch.cat( + [ + v, + torch.zeros( + v.size(0), + v_pad_len, + v.size(2), + v.size(3), + dtype=v.dtype, + device=v.device, + ), + ], + dim=1, + ) + + v_fp8, v_scale, _ = per_channel_fp8(v, tensor_layout=tensor_layout, smooth_v=False) + + if pv_accum_dtype == "fp32": + raise NotImplementedError("Please use 
pv_accum_dtype='fp32+fp32' for sm90.") + lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn( + q_int8, + k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + elif pv_accum_dtype == "fp32+fp32": + print( + "qint8", + q_int8.shape, + "qscale", + q_scale.shape, + "kint8", + k_int8.shape, + "kscale", + k_scale.shape, + "vfp8", + v_fp8.shape, + "vscale", + v_scale.shape, + ) + lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf_sm90( + q_int8, + k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + + o = o[..., :head_dim_og] + + if return_lse: + return ( + o, + lse / 1.44269504 + lse_correction * sm_scale + if smooth_k + else lse / 1.44269504, + ) + else: + return o diff --git a/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/layers.py b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/quant.py b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/quant.py new file mode 100644 index 0000000000000000000000000000000000000000..2e7a32c6a1e502bee12d0e0564ff2b90f6b00462 --- /dev/null +++ b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/quant.py @@ -0,0 +1,326 @@ +""" +Copyright (c) 2024 by SageAttention team. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import torch +from typing import Optional + +from ._ops import ops + + +def per_block_int8( + q: torch.Tensor, + k: torch.Tensor, + km: Optional[torch.Tensor] = None, + BLKQ: int = 128, + BLKK: int = 64, + sm_scale: Optional[float] = None, + tensor_layout: str = "HND", +): + """ + Quantize the query tensor `q` and the key tensor `k` with per block quantization. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + km : Optional[torch.Tensor] + The mean tensor of `k` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]``. + Should be of the same dtype as `k` if provided. Default is None. + + sm_scale : Optional[float] + The scale factor for the softmax operation. Default is ``head_dim**-0.5``. + It will be multiplied by ``1.44269504`` to work together with the triton attention kernel. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + Returns + ------- + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor] + A tuple containing: + - The quantized query tensor. Shape: Same as `q` but with `int8` dtype. 
+ - The scale tensor of the query tensor. Shape: ``[batch_size, num_qo_heads, (qo_len + BLKQ - 1) // BLKQ]`` with `float32` dtype. + - The quantized key tensor. Shape: Same as `k` but with `int8` dtype. + - The scale tensor of the key tensor. Shape: ``[batch_size, num_kv_heads, (kv_len + BLKK - 1) // BLKK]`` with `float32` dtype. + + Note + ---- + - The tensors `q` and `k` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + """ + + q_int8 = torch.empty(q.shape, dtype=torch.int8, device=q.device) + k_int8 = torch.empty(k.shape, dtype=torch.int8, device=k.device) + + if tensor_layout == "HND": + b, h_qo, qo_len, head_dim = q.shape + _, h_kv, kv_len, _ = k.shape + + elif tensor_layout == "NHD": + b, qo_len, h_qo, head_dim = q.shape + _, kv_len, h_kv, _ = k.shape + + else: + raise ValueError(f"Unknown tensor layout: {tensor_layout}") + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + + q_scale = torch.empty( + (b, h_qo, (qo_len + BLKQ - 1) // BLKQ), device=q.device, dtype=torch.float32 + ) + k_scale = torch.empty( + (b, h_kv, (kv_len + BLKK - 1) // BLKK), device=q.device, dtype=torch.float32 + ) + + if sm_scale is None: + sm_scale = head_dim**-0.5 + + sm_scale *= 1.44269504 + + ops.quant_per_block_int8_cuda(q, q_int8, q_scale, sm_scale, BLKQ, _tensor_layout) + if km is not None: + km = km.squeeze(1) if _tensor_layout == 0 else km.squeeze(2) + ops.quant_per_block_int8_fuse_sub_mean_cuda( + k, km, k_int8, k_scale, BLKK, _tensor_layout + ) + else: + # The bound CUDA op expects an sm_scale argument; use 1.0 for K to avoid scaling + ops.quant_per_block_int8_cuda(k, k_int8, k_scale, 1.0, BLKK, _tensor_layout) + + return q_int8, q_scale, k_int8, k_scale + + +def per_warp_int8( + q: torch.Tensor, + k: torch.Tensor, + km: Optional[torch.Tensor] = None, + BLKQ: int = 128, + WARPQ: int = 32, + BLKK: int = 64, + tensor_layout: str = "HND", +): + """ + Quantize the query tensor `q` with per warp quantization and the key tensor `k` with per block quantization. + Warp size of quantizing `q` is 16 or 32, with a block size of 64 or 128. + Block size of quantizing `k` is 64 or 128. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + km : Optional[torch.Tensor] + The mean tensor of `k` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]``. + Should be of the same dtype as `k` if provided. Default is None. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + Returns + ------- + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor] + A tuple containing: + - The quantized query tensor. Shape: Same as `q` but with `int8` dtype. + - The scale tensor of the query tensor. Shape: ``[batch_size, num_qo_heads, (qo_len + BLKQ - 1) // BLKQ * (BLKQ // WARPQ)]`` with `float32` dtype. + - The quantized key tensor. Shape: Same as `k` but with `int8` dtype. + - The scale tensor of the key tensor. Shape: ``[batch_size, num_kv_heads, (kv_len + BLKK - 1) // BLKK]`` with `float32` dtype. 
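+
+        For example (illustrative numbers): with ``qo_len = 1000``, ``BLKQ = 128`` and
+        ``WARPQ = 32``, the query scale has ``ceil(1000 / 128) * (128 // 32) = 8 * 4 = 32``
+        entries per head; with ``kv_len = 1000`` and ``BLKK = 64``, the key scale has
+        ``ceil(1000 / 64) = 16`` entries per head.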
+ + Note + ---- + - The tensors `q` and `k` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + """ + + q_int8 = torch.empty(q.shape, dtype=torch.int8, device=q.device) + k_int8 = torch.empty(k.shape, dtype=torch.int8, device=k.device) + + if tensor_layout == "HND": + b, h_qo, qo_len, head_dim = q.shape + _, h_kv, kv_len, _ = k.shape + + elif tensor_layout == "NHD": + b, qo_len, h_qo, head_dim = q.shape + _, kv_len, h_kv, _ = k.shape + + else: + raise ValueError(f"Unknown tensor layout: {tensor_layout}") + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + + q_scale = torch.empty( + (b, h_qo, ((qo_len + BLKQ - 1) // BLKQ) * (BLKQ // WARPQ)), + device=q.device, + dtype=torch.float32, + ) + k_scale = torch.empty( + (b, h_kv, (kv_len + BLKK - 1) // BLKK), device=q.device, dtype=torch.float32 + ) + + ops.quant_per_warp_int8_cuda(q, q_int8, q_scale, BLKQ, WARPQ, _tensor_layout) + + if km is not None: + km = km.squeeze(1) if _tensor_layout == 0 else km.squeeze(2) + ops.quant_per_block_int8_fuse_sub_mean_cuda( + k, km, k_int8, k_scale, BLKK, _tensor_layout + ) + else: + # The bound CUDA op expects an sm_scale argument; use 1.0 for K to avoid scaling + ops.quant_per_block_int8_cuda(k, k_int8, k_scale, 1.0, BLKK, _tensor_layout) + + return q_int8, q_scale, k_int8, k_scale + + +def sub_mean(v: torch.Tensor, tensor_layout: str = "HND"): + """ + Calculate the mean of the tensor `v` along the sequence length dimension and subtract it from `v`. Result is stored as fp16. + + Parameters + ---------- + v : torch.Tensor + The input tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + Returns + ------- + Tuple[torch.Tensor, torch.Tensor] + A tuple containing: + - The tensor `v_smoothed` with the mean subtracted and stored as fp16. Shape: Same as `v` with `float16` dtype. + - The mean tensor of `v` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]`` with dtype same as `v`. + + Note + ---- + - The tensors `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - The returned tensor `v_smoothed` will have dtype ``torch.float16`` regardless of the input dtype. + - The returned mean tensor will have the same dtype as the input tensor. + """ + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + vm = v.mean(dim=1 if _tensor_layout == 0 else 2) + + v_smoothed = torch.empty(v.shape, dtype=torch.float16, device=v.device) + + # subtract mean and store the result as fp16 + ops.sub_mean_cuda(v, vm, v_smoothed, _tensor_layout) + + return v_smoothed, vm + + +def per_channel_fp8( + v: torch.Tensor, + tensor_layout: str = "HND", + scale_max: float = 448.0, + smooth_v: bool = True, +): + """ + Transpose, pad and permute the tensor `v` and quantize it to fp8 with per channel quantization. + `v` is first transposed along the head dimension and the sequence length dimension, then padded to a multiple of 64. + After that, the tensor is permuted along the sequence length dimension by ``[0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15]``. + The quantization is done per channel, with the scale value and smooth factor calculated per channel. + + Parameters + ---------- + v : torch.Tensor + The input tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. 
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + scale_max : float + The maximum scale value for the quantization. Default is 448.0 (upper bound of E4M3 data format). + + smooth_v : bool + Whether to smooth the quantized tensor. Default is True. + + Returns + ------- + Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] + A tuple containing: + - The quantized tensor `v_fp8`. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, head_dim, (kv_len + 63) // 64 * 64]``, with `float8_e4m3fn` dtype. + - If `tensor_layout` is "NHD": ``[batch_size, head_dim, num_kv_heads, (kv_len + 63) // 64 * 64]``, with `float8_e4m3fn` dtype. + - The scale tensor of `v`. Shape: ``[batch_size, num_kv_heads, head_dim]`` with `float32` dtype. + - The mean tensor of `v` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]`` with `float32` dtype. + + Note + ---- + - The tensors `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - The returned mean tensor will be None if `smooth_v` is False. Otherwise it will have dtype ``torch.float32``. + """ + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + + if tensor_layout == "HND": + b, h_kv, kv_len, head_dim = v.shape + padded_len = (kv_len + 63) // 64 * 64 + v_transposed_permutted = torch.empty( + (b, h_kv, head_dim, padded_len), dtype=v.dtype, device=v.device + ) + + elif tensor_layout == "NHD": + b, kv_len, h_kv, head_dim = v.shape + padded_len = (kv_len + 63) // 64 * 64 + v_transposed_permutted = torch.empty( + (b, head_dim, h_kv, padded_len), dtype=v.dtype, device=v.device + ) + + ops.transpose_pad_permute_cuda(v, v_transposed_permutted, _tensor_layout) + + v_fp8 = torch.empty( + v_transposed_permutted.shape, dtype=torch.float8_e4m3fn, device=v.device + ) + + v_scale = torch.empty((b, h_kv, head_dim), dtype=torch.float32, device=v.device) + vm = torch.empty((b, h_kv, head_dim), dtype=torch.float32, device=v.device) + + if smooth_v: + ops.mean_scale_fuse_quant_cuda( + v_transposed_permutted, + v_fp8, + vm, + v_scale, + kv_len, + scale_max, + _tensor_layout, + ) + return v_fp8, v_scale, vm + else: + ops.scale_fuse_quant_cuda( + v_transposed_permutted, v_fp8, v_scale, kv_len, scale_max, _tensor_layout + ) + return v_fp8, v_scale, None diff --git a/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/quant_per_thread.py b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/quant_per_thread.py new file mode 100644 index 0000000000000000000000000000000000000000..ab81f57c3e6dd9df89946ba54c4e2c3844c94d34 --- /dev/null +++ b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/quant_per_thread.py @@ -0,0 +1,204 @@ +""" +Copyright (c) 2024 by SageAttention team. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import torch +import triton +import triton.language as tl + +@triton.jit +def quant_query_per_thread_int8_kernel(Input, Output, Scale, L, + stride_iz, stride_ih, stride_in, + stride_oz, stride_oh, stride_on, + stride_sz, stride_sh, + C: tl.constexpr, BLK: tl.constexpr): + off_blk = tl.program_id(0) // 8 + off_tld = tl.program_id(0) % 8 + off_h = tl.program_id(1) + off_b = tl.program_id(2) + + offs_n = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld + offs_k = tl.arange(0, C) + + input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :] + output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :] + scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 8 + off_tld + + x = tl.load(input_ptrs, mask=offs_n[:, None] < L) + x = x.to(tl.float32) + scale = tl.max(tl.abs(x)) / 127. + 0.0000001 + x_int8 = x / scale + x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1) + x_int8 = x_int8.to(tl.int8) + tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L) + tl.store(scale_ptrs, scale) + +@triton.jit +def quant_key_per_thread_int8_kernel(Input, Output, Scale, L, + stride_iz, stride_ih, stride_in, + stride_oz, stride_oh, stride_on, + stride_sz, stride_sh, + C: tl.constexpr, BLK: tl.constexpr): + off_blk = tl.program_id(0) // 4 + off_tld = tl.program_id(0) % 4 + off_h = tl.program_id(1) + off_b = tl.program_id(2) + + # offs_n = off_blk * BLK + tl.cat(tl.arange(0, BLK // 8) * 8, tl.arange(0, BLK // 8) * 8 + 1, True) + off_tld * 2 + # offs_k = tl.arange(0, C) + + # input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :] + # output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :] + # scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 4 + off_tld + + # x = tl.load(input_ptrs, mask=offs_n[:, None] < L) + # x = x.to(tl.float32) + # scale = tl.max(tl.abs(x)) / 127. + 0.0000001 + # x_int8 = x / scale + # x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1) + # x_int8 = x_int8.to(tl.int8) + # tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L) + # tl.store(scale_ptrs, scale) + + offs_n0 = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld * 2 + offs_n1 = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld * 2 + 1 + offs_k = tl.arange(0, C) + + input_ptrs0 = Input + off_b * stride_iz + off_h * stride_ih + offs_n0[:, None] * stride_in + offs_k[None, :] + input_ptrs1 = Input + off_b * stride_iz + off_h * stride_ih + offs_n1[:, None] * stride_in + offs_k[None, :] + output_ptrs0 = Output + off_b * stride_oz + off_h * stride_oh + offs_n0[:, None] * stride_on + offs_k[None, :] + output_ptrs1 = Output + off_b * stride_oz + off_h * stride_oh + offs_n1[:, None] * stride_on + offs_k[None, :] + scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 4 + off_tld + + x0 = tl.load(input_ptrs0, mask=offs_n0[:, None] < L) + x1 = tl.load(input_ptrs1, mask=offs_n1[:, None] < L) + x0 = x0.to(tl.float32) + x1 = x1.to(tl.float32) + scale = max(tl.max(tl.abs(x0)), tl.max(tl.abs(x1))) / 127. 
+ 0.0000001 + x0_int8 = x0 / scale + x1_int8 = x1 / scale + x0_int8 += 0.5 * tl.where(x0_int8 >= 0, 1, -1) + x1_int8 += 0.5 * tl.where(x1_int8 >= 0, 1, -1) + x0_int8 = x0_int8.to(tl.int8) + x1_int8 = x1_int8.to(tl.int8) + tl.store(output_ptrs0, x0_int8, mask=offs_n0[:, None] < L) + tl.store(output_ptrs1, x1_int8, mask=offs_n1[:, None] < L) + tl.store(scale_ptrs, scale) + +@triton.jit +def quant_query_per_thread_int4_kernel(Input, Output, Scale, L, + stride_iz, stride_ih, stride_in, + stride_oz, stride_oh, stride_on, + stride_sz, stride_sh, + C: tl.constexpr, BLK: tl.constexpr): + off_blk = tl.program_id(0) // 8 + off_tld = tl.program_id(0) % 8 + off_h = tl.program_id(1) + off_b = tl.program_id(2) + + offs_n = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld + offs_k = tl.arange(0, C) + + input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :] + output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :] + scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 8 + off_tld + + x = tl.load(input_ptrs, mask=offs_n[:, None] < L) + x = x.to(tl.float32) + scale = tl.max(tl.abs(x)) / 7. + 0.0000001 + x_int8 = x / scale + x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1) + x_int8 = x_int8.to(tl.int8) + tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L) + tl.store(scale_ptrs, scale) + +@triton.jit +def quant_key_per_thread_int4_kernel(Input, Output, Scale, L, + stride_iz, stride_ih, stride_in, + stride_oz, stride_oh, stride_on, + stride_sz, stride_sh, + C: tl.constexpr, BLK: tl.constexpr): + off_blk = tl.program_id(0) // 4 + off_tld = tl.program_id(0) % 4 + off_h = tl.program_id(1) + off_b = tl.program_id(2) + + offs_n = off_blk * BLK + tl.cat(tl.arange(0, BLK // 8) * 8, tl.arange(0, BLK // 8) * 8 + 1, True) + off_tld * 2 + offs_k = tl.arange(0, C) + + input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :] + output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :] + scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 4 + off_tld + + x = tl.load(input_ptrs, mask=offs_n[:, None] < L) + x = x.to(tl.float32) + scale = tl.max(tl.abs(x)) / 7. 
+ 0.0000001 + x_int8 = x / scale + x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1) + x_int8 = x_int8.to(tl.int8) + tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L) + tl.store(scale_ptrs, scale) + +def per_thread_int8(q, k, km=None, BLKQ=128, WARPQ=32, BLKK=64, WARPK=64, sm_scale=None, tensor_layout="HND"): + q_int8 = torch.empty(q.shape, dtype=torch.int8, device=q.device) + k_int8 = torch.empty(k.shape, dtype=torch.int8, device=k.device) + + if km is not None: + k = k - km + + if tensor_layout == "HND": + b, h_qo, qo_len, head_dim = q.shape + _, h_kv, kv_len, _ = k.shape + + stride_bz_q, stride_h_q, stride_seq_q = q.stride(0), q.stride(1), q.stride(2) + stride_bz_qo, stride_h_qo, stride_seq_qo = q_int8.stride(0), q_int8.stride(1), q_int8.stride(2) + stride_bz_k, stride_h_k, stride_seq_k = k.stride(0), k.stride(1), k.stride(2) + stride_bz_ko, stride_h_ko, stride_seq_ko = k_int8.stride(0), k_int8.stride(1), k_int8.stride(2) + elif tensor_layout == "NHD": + b, qo_len, h_qo, head_dim = q.shape + _, kv_len, h_kv, _ = k.shape + + stride_bz_q, stride_h_q, stride_seq_q = q.stride(0), q.stride(2), q.stride(1) + stride_bz_qo, stride_h_qo, stride_seq_qo = q_int8.stride(0), q_int8.stride(2), q_int8.stride(1) + stride_bz_k, stride_h_k, stride_seq_k = k.stride(0), k.stride(2), k.stride(1) + stride_bz_ko, stride_h_ko, stride_seq_ko = k_int8.stride(0), k_int8.stride(2), k_int8.stride(1) + else: + raise ValueError(f"Unknown tensor layout: {tensor_layout}") + + q_scale = torch.empty((b, h_qo, (qo_len + BLKQ - 1) // BLKQ * (BLKQ // WARPQ) * 8), device=q.device, dtype=torch.float32) + k_scale = torch.empty((b, h_kv, (kv_len + BLKK - 1) // BLKK * (BLKK // WARPK) * 4), device=q.device, dtype=torch.float32) + + if sm_scale is None: + sm_scale = head_dim**-0.5 + + grid = ((qo_len + BLKQ - 1) // BLKQ * (BLKQ // WARPQ) * 8, h_qo, b) + quant_query_per_thread_int8_kernel[grid]( + q, q_int8, q_scale, qo_len, + stride_bz_q, stride_h_q, stride_seq_q, + stride_bz_qo, stride_h_qo, stride_seq_qo, + q_scale.stride(0), q_scale.stride(1), + C=head_dim, BLK=WARPQ + ) + + grid = ((kv_len + BLKK - 1) // BLKK * (BLKK // WARPK) * 4, h_kv, b) + quant_key_per_thread_int8_kernel[grid]( + k, k_int8, k_scale, kv_len, + stride_bz_k, stride_h_k, stride_seq_k, + stride_bz_ko, stride_h_ko, stride_seq_ko, + k_scale.stride(0), k_scale.stride(1), + C=head_dim, BLK=WARPK + ) + + return q_int8, q_scale, k_int8, k_scale \ No newline at end of file diff --git a/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__init__.py b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5df69e92664edfbd0820d5126792e41e23a72762 --- /dev/null +++ b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__init__.py @@ -0,0 +1,12 @@ +from .quant import per_block_int8, per_warp_int8, sub_mean, per_channel_fp8 +from .core import sageattn, sageattn_qk_int8_pv_fp8_cuda + + +__all__ = [ + "per_block_int8", + "per_warp_int8", + "sub_mean", + "per_channel_fp8", + "sageattn", + "sageattn_qk_int8_pv_fp8_cuda", +] \ No newline at end of file diff --git a/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ad915a1553f70a034d95cf30d35f65f414cdddf4 Binary files /dev/null and b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc differ diff --git 
a/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b15635cec94a98d3a8ae65de7d8470620cdc1ca5 Binary files /dev/null and b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d862b1f4b01b67e8739eace8ac79a35d6a0fb55e Binary files /dev/null and b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a90a9e0a4b0b2c71e99fb920efbddfbe0d73c8b3 Binary files /dev/null and b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f05baf4ed4fda1f06cdf6559aeac4612bf2413df Binary files /dev/null and b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/_ops.py b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..7dc56aab172e24e052a9f0c78060ecfbdff00309 --- /dev/null +++ b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _sage_attention_44b112f_dirty +ops = torch.ops._sage_attention_44b112f_dirty + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_sage_attention_44b112f_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/_sage_attention_44b112f_dirty.abi3.so b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/_sage_attention_44b112f_dirty.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..8f2d058f3d5e4c696f8dafa57189b7263e0c607d --- /dev/null +++ b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/_sage_attention_44b112f_dirty.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:826ab66e6c33b3b2b17c30371934a55e972d560197c5492f4dedf6fcc29f1a1e +size 26553920 diff --git a/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/core.py b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/core.py new file mode 100644 index 0000000000000000000000000000000000000000..dc44a8e1ee17a5c5c65da5adda6faf9228cca55e --- /dev/null +++ b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/core.py @@ -0,0 +1,983 @@ +""" +Copyright (c) 2024 by SageAttention team. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import torch +import torch.nn.functional as F + +from ._ops import ops + + +from .quant import per_warp_int8 as per_warp_int8_cuda +from .quant import sub_mean +from .quant import per_channel_fp8 +from .quant_per_thread import per_thread_int8 as per_thread_int8_triton + +from typing import Any, List, Literal, Optional, Tuple, Union +import warnings + + +import subprocess +import re + + +def get_cuda_version(): + try: + output = subprocess.check_output(["nvcc", "--version"]).decode() + match = re.search(r"release (\d+)\.(\d+)", output) + if match: + major, minor = int(match.group(1)), int(match.group(2)) + return major, minor + except Exception as e: + print("Failed to get CUDA version:", e) + return None, None + + +def get_cuda_arch_versions(): + cuda_archs = [] + for i in range(torch.cuda.device_count()): + major, minor = torch.cuda.get_device_capability(i) + cuda_archs.append(f"sm{major}{minor}") + return cuda_archs + + +def sageattn( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + tensor_layout: str = "HND", + is_causal: bool = False, + sm_scale: Optional[float] = None, + return_lse: bool = False, + **kwargs: Any, +): + """ + Automatically selects the appropriate implementation of the SageAttention kernel based on the GPU compute capability. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + v : torch.Tensor + The value tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + is_causal : bool + Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len. + Default: False. + + sm_scale : Optional[float] + The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``. + + return_lse : bool + Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention. + Default: False. + + Returns + ------- + torch.Tensor + The output tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + torch.Tensor + The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor). + Shape: ``[batch_size, num_qo_heads, qo_len]``. + Only returned if `return_lse` is True. + + Note + ---- + - ``num_qo_heads`` must be divisible by ``num_kv_heads``. + - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - All tensors must be on the same cuda device. 
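+
+    Example
+    -------
+    A sketch using the "NHD" layout and returning the logsumexp (shapes are illustrative;
+    assumes bf16 inputs on a CUDA device)::
+
+        q = torch.randn(2, 4096, 16, 64, dtype=torch.bfloat16, device="cuda")
+        k = torch.randn(2, 4096, 16, 64, dtype=torch.bfloat16, device="cuda")
+        v = torch.randn(2, 4096, 16, 64, dtype=torch.bfloat16, device="cuda")
+        o, lse = sageattn(q, k, v, tensor_layout="NHD", return_lse=True)
+        # o: [2, 4096, 16, 64]; lse: [2, 16, 4096]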
+ """ + + arch = get_cuda_arch_versions()[q.device.index] + if arch == "sm80": + return sageattn_qk_int8_pv_fp16_cuda( + q, + k, + v, + tensor_layout=tensor_layout, + is_causal=is_causal, + sm_scale=sm_scale, + return_lse=return_lse, + pv_accum_dtype="fp32", + ) + elif arch == "sm89": + return sageattn_qk_int8_pv_fp8_cuda( + q, + k, + v, + tensor_layout=tensor_layout, + is_causal=is_causal, + sm_scale=sm_scale, + return_lse=return_lse, + pv_accum_dtype="fp32+fp16", + ) + elif arch == "sm90": + return sageattn_qk_int8_pv_fp8_cuda_sm90( + q, + k, + v, + tensor_layout=tensor_layout, + is_causal=is_causal, + sm_scale=sm_scale, + return_lse=return_lse, + pv_accum_dtype="fp32+fp32", + ) + elif arch == "sm120": + return sageattn_qk_int8_pv_fp8_cuda( + q, + k, + v, + tensor_layout=tensor_layout, + is_causal=is_causal, + qk_quant_gran="per_warp", + sm_scale=sm_scale, + return_lse=return_lse, + pv_accum_dtype="fp32+fp16", + ) # sm120 has accurate fp32 accumulator for fp8 mma and triton kernel is currently not usable on sm120. + else: + raise ValueError(f"Unsupported CUDA architecture: {arch}") + + +@torch.compiler.disable +def sageattn_qk_int8_pv_fp16_cuda( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + tensor_layout: str = "HND", + is_causal: bool = False, + qk_quant_gran: str = "per_thread", + sm_scale: Optional[float] = None, + pv_accum_dtype: str = "fp32", + smooth_k: bool = True, + smooth_v: bool = False, + return_lse: bool = False, + **kwargs: Any, +) -> torch.Tensor: + """ + SageAttention with INT8 quantization for Q and K, FP16 PV with FP16/FP32 accumulation, implemented using CUDA. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + v : torch.Tensor + The value tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + is_causal : bool + Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len. + Default: False. + + qk_quant_gran : str + The granularity of quantization for Q and K, either "per_warp" or "per_thread". + Default: "per_thread". + + sm_scale : Optional[float] + The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``. + + pv_accum_dtype : str + The dtype of the accumulation of the product of the value tensor and the attention weights, either "fp16", "fp16+fp32" or "fp32". + - "fp16": PV accumulation is done in fully in FP16. This is the fastest option but may lead to numerical instability. `smooth_v` option will increase the accuracy in cases when the value tensor has a large bias (like in CogVideoX-2b). + - "fp32": PV accumulation is done in FP32. This is the most accurate option but may be slower than "fp16" due to CUDA core overhead. + - "fp16+fp32": PV accumulation is done in FP16, but added to a FP32 buffer every few iterations. This offers a balance between speed and accuracy. + Default: "fp32". 
+ + smooth_k : bool + Whether to smooth the key tensor by subtracting the mean along the sequence dimension. + Default: True. + + smooth_v : bool + Whether to smooth the value tensor by subtracting the mean along the sequence dimension. + smooth_v will be ignored if pv_accum_dtype is "fp32" or "fp16+fp32". + Default: False. + + return_lse : bool + Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention. + Default: False. + + Returns + ------- + torch.Tensor + The output tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + torch.Tensor + The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor). + Shape: ``[batch_size, num_qo_heads, qo_len]``. + Only returned if `return_lse` is True. + + Note + ---- + - ``num_qo_heads`` must be divisible by ``num_kv_heads``. + - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - All tensors must be on the same cuda device. + - `smooth_k` will introduce slight overhead but will improve the accuracy under most circumstances. + """ + + dtype = q.dtype + assert q.is_cuda, "Input tensors must be on cuda." + assert dtype in [torch.float16, torch.bfloat16], ( + "Input tensors must be in dtype of torch.float16 or torch.bfloat16" + ) + assert qk_quant_gran in ["per_warp", "per_thread"], ( + "qk_quant_gran must be either 'per_warp' or 'per_thread'." + ) + assert q.device == k.device == v.device, "All tensors must be on the same device." + assert q.dtype == k.dtype == v.dtype, "All tensors must have the same dtype." + + # FIXME(DefTruth): make sage attention work compatible with distributed + # env, for example, xDiT which launch by torchrun. Without this workaround, + # sage attention will run into illegal memory access error after first + # inference step in distributed env for multi gpus inference. This small + # workaround also make sage attention work compatible with torch.compile + # through non-fullgraph compile mode. + torch.cuda.set_device(v.device) + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + _is_caual = 1 if is_causal else 0 + _qk_quant_gran = 3 if qk_quant_gran == "per_thread" else 2 + _return_lse = 1 if return_lse else 0 + + head_dim_og = q.size(-1) + + if head_dim_og < 64: + q = torch.nn.functional.pad(q, (0, 64 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 64 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 64 - head_dim_og)) + elif head_dim_og > 64 and head_dim_og < 128: + q = torch.nn.functional.pad(q, (0, 128 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 128 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 128 - head_dim_og)) + elif head_dim_og > 128: + raise ValueError(f"Unsupported head_dim: {head_dim_og}") + + # assert last dim is contiguous + assert q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1, ( + "Last dim of qkv must be contiguous." 
+ ) + + if sm_scale is None: + sm_scale = head_dim_og**-0.5 + + seq_dim = 1 if _tensor_layout == 0 else 2 + nh_dim = 2 if _tensor_layout == 0 else 1 + + if smooth_k: + km = k.mean(dim=seq_dim, keepdim=True) + nqheads = q.size(2) + nkheads = k.size(2) + q_per_kv_heads = nqheads // nkheads + if q_per_kv_heads > 1: + # nheads_k => nheads_q + km_broadcast = torch.repeat_interleave(km, q_per_kv_heads, dim=nh_dim) + else: + km_broadcast = km + if return_lse: + if tensor_layout == "NHD": + lse_correction = ( + torch.matmul( + q.transpose(1, 2), km_broadcast.transpose(1, 2).transpose(2, 3) + ) + .squeeze(-1) + .to(torch.float32) + ) + else: + lse_correction = ( + torch.matmul(q, km_broadcast.transpose(2, 3)) + .squeeze(-1) + .to(torch.float32) + ) + else: + km = None + + if qk_quant_gran == "per_warp": + q_int8, q_scale, k_int8, k_scale = per_warp_int8_cuda( + q, + k, + km, + tensor_layout=tensor_layout, + BLKQ=128, + WARPQ=(16 if (q.size(-1) == 128 and pv_accum_dtype == "fp16+fp32") else 32), + BLKK=64, + ) + elif qk_quant_gran == "per_thread": + q_int8, q_scale, k_int8, k_scale = per_thread_int8_triton( + q, + k, + km, + tensor_layout=tensor_layout, + BLKQ=128, + WARPQ=(16 if (q.size(-1) == 128 and pv_accum_dtype == "fp16+fp32") else 32), + BLKK=64, + WARPK=64, + ) + + o = torch.empty(q.size(), dtype=dtype, device=q.device) + + if pv_accum_dtype in ["fp32", "fp16+fp32"] and smooth_v: + warnings.warn(f"pv_accum_dtype is {pv_accum_dtype}, smooth_v will be ignored.") + smooth_v = False + + if pv_accum_dtype == "fp32": + v = v.to(torch.float16) + lse = _qattn_sm80.qk_int8_sv_f16_accum_f32_attn( + q_int8, + k_int8, + v, + o, + q_scale, + k_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + elif pv_accum_dtype == "fp16": + if smooth_v: + smoothed_v, vm = sub_mean(v, tensor_layout=tensor_layout) + lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_fuse_v_mean_attn( + q_int8, + k_int8, + smoothed_v, + o, + q_scale, + k_scale, + vm, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + else: + v = v.to(torch.float16) + lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_attn( + q_int8, + k_int8, + v, + o, + q_scale, + k_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + elif pv_accum_dtype == "fp16+fp32": + v = v.to(torch.float16) + lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_attn_inst_buf( + q_int8, + k_int8, + v, + o, + q_scale, + k_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + else: + raise ValueError(f"Unsupported pv_accum_dtype: {pv_accum_dtype}") + + o = o[..., :head_dim_og] + + if return_lse: + return ( + o, + lse / 1.44269504 + lse_correction * sm_scale + if smooth_k + else lse / 1.44269504, + ) + else: + return o + + +@torch.compiler.disable +def sageattn_qk_int8_pv_fp8_cuda( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + tensor_layout: str = "HND", + is_causal: bool = False, + qk_quant_gran: str = "per_thread", + sm_scale: Optional[float] = None, + pv_accum_dtype: str = "fp32+fp16", + smooth_k: bool = True, + smooth_v: bool = False, + return_lse: bool = False, + **kwargs: Any, +) -> torch.Tensor: + """ + SageAttention with INT8 quantization for Q and K, FP8 PV with FP32 accumulation, implemented using CUDA. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. 
+ + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + v : torch.Tensor + The value tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + is_causal : bool + Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len. + Default: False. + + qk_quant_gran : str + The granularity of quantization for Q and K, either "per_warp" or "per_thread". + Default: "per_thread". + + sm_scale : Optional[float] + The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``. + + pv_accum_dtype : str + The dtype of the accumulation of the product of the value tensor and the attention weights, either "fp32" or "fp32+fp32". + - "fp32": PV accumulation is done in fully in FP32. However, due to the hardware issue, there are only 22 valid bits in the FP32 accumulator. + - "fp32+fp32": PV accumulation is done in FP32 (actually FP22), but added to a FP32 buffer every few iterations. This offers a balance between speed and accuracy. + Default: "fp32+fp32". + + smooth_k : bool + Whether to smooth the key tensor by subtracting the mean along the sequence dimension. + Default: True. + + smooth_v : bool + Whether to smooth the value tensor by subtracting the mean along the sequence dimension. + smooth_v will be ignored if pv_accum_dtype is "fp32+fp32". + Default: False. + + return_lse : bool + Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention. + Default: False. + + Returns + ------- + torch.Tensor + The output tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + torch.Tensor + The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor). + Shape: ``[batch_size, num_qo_heads, qo_len]``. + Only returned if `return_lse` is True. + + Note + ---- + - ``num_qo_heads`` must be divisible by ``num_kv_heads``. + - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - All tensors must be on the same cuda device. + - `smooth_k` will introduce slight overhead but will improve the accuracy under most circumstances. + """ + + dtype = q.dtype + assert q.is_cuda, "Input tensors must be on cuda." + assert dtype in [torch.float16, torch.bfloat16], ( + "Input tensors must be in dtype of torch.float16 or torch.bfloat16" + ) + assert qk_quant_gran in ["per_warp", "per_thread"], ( + "qk_quant_gran must be either 'per_warp' or 'per_thread'." + ) + assert q.device == k.device == v.device, "All tensors must be on the same device." + assert q.dtype == k.dtype == v.dtype, "All tensors must have the same dtype." + + # cuda_major_version, cuda_minor_version = get_cuda_version() + # if(cuda_major_version, cuda_minor_version) < (12, 8) and pv_accum_dtype == 'fp32+fp16': + # warnings.warn("cuda version < 12.8, change pv_accum_dtype to 'fp32+fp32'") + # pv_accum_dtype = 'fp32+fp32' + + # FIXME(DefTruth): make sage attention work compatible with distributed + # env, for example, xDiT which launch by torchrun. 
Without this workaround, + # sage attention will run into illegal memory access error after first + # inference step in distributed env for multi gpus inference. This small + # workaround also make sage attention work compatible with torch.compile + # through non-fullgraph compile mode. + torch.cuda.set_device(v.device) + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + _is_caual = 1 if is_causal else 0 + _qk_quant_gran = 3 if qk_quant_gran == "per_thread" else 2 + _return_lse = 1 if return_lse else 0 + + head_dim_og = q.size(-1) + + if head_dim_og < 64: + q = torch.nn.functional.pad(q, (0, 64 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 64 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 64 - head_dim_og)) + elif head_dim_og > 64 and head_dim_og < 128: + q = torch.nn.functional.pad(q, (0, 128 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 128 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 128 - head_dim_og)) + elif head_dim_og > 128: + raise ValueError(f"Unsupported head_dim: {head_dim_og}") + + # assert last dim is contiguous + assert q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1, ( + "Last dim of qkv must be contiguous." + ) + + if sm_scale is None: + sm_scale = head_dim_og**-0.5 + + seq_dim = 1 if _tensor_layout == 0 else 2 + nh_dim = 2 if _tensor_layout == 0 else 1 + + if smooth_k: + km = k.mean(dim=seq_dim, keepdim=True) + nqheads = q.size(2) + nkheads = k.size(2) + q_per_kv_heads = nqheads // nkheads + if q_per_kv_heads > 1: + # nheads_k => nheads_q + km_broadcast = torch.repeat_interleave(km, q_per_kv_heads, dim=nh_dim) + else: + km_broadcast = km + if return_lse: + if tensor_layout == "NHD": + lse_correction = ( + torch.matmul( + q.transpose(1, 2), km_broadcast.transpose(1, 2).transpose(2, 3) + ) + .squeeze(-1) + .to(torch.float32) + ) + else: + lse_correction = ( + torch.matmul(q, km_broadcast.transpose(2, 3)) + .squeeze(-1) + .to(torch.float32) + ) + else: + km = None + + if qk_quant_gran == "per_warp": + q_int8, q_scale, k_int8, k_scale = per_warp_int8_cuda( + q, k, km, tensor_layout=tensor_layout, BLKQ=128, WARPQ=32, BLKK=64 + ) + elif qk_quant_gran == "per_thread": + q_int8, q_scale, k_int8, k_scale = per_thread_int8_triton( + q, k, km, tensor_layout=tensor_layout, BLKQ=128, WARPQ=32, BLKK=64, WARPK=64 + ) + + o = torch.empty(q.size(), dtype=dtype, device=q.device) + + if pv_accum_dtype == "fp32+fp32" and smooth_v: + warnings.warn("pv_accum_dtype is 'fp32+fp32', smooth_v will be ignored.") + smooth_v = False + + if pv_accum_dtype == "fp32+fp16" and smooth_v: + warnings.warn("pv_accum_dtype is 'fp32+fp16', smooth_v will be ignored.") + smooth_v = False + + quant_v_scale_max = 448.0 + if pv_accum_dtype == "fp32+fp16": + quant_v_scale_max = 2.25 + + v_fp8, v_scale, vm = per_channel_fp8( + v, tensor_layout=tensor_layout, scale_max=quant_v_scale_max, smooth_v=smooth_v + ) + print("before kernel call") + if pv_accum_dtype == "fp32": + if smooth_v: + lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_fuse_v_mean_attn( + q_int8, + k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + vm, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + torch.cuda.synchronize() + else: + lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn( + q_int8, + k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + torch.cuda.synchronize() + elif pv_accum_dtype == "fp32+fp32": + lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf( + q_int8, + 
k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + torch.cuda.synchronize() + elif pv_accum_dtype == "fp32+fp16": + lse = ops.qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf( + q_int8, + k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + torch.cuda.synchronize() + o = o[..., :head_dim_og] + print("after kernel call") + if return_lse: + return ( + o, + lse / 1.44269504 + lse_correction * sm_scale + if smooth_k + else lse / 1.44269504, + ) + else: + return o + + +@torch.compiler.disable +def sageattn_qk_int8_pv_fp8_cuda_sm90( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + tensor_layout: str = "HND", + is_causal: bool = False, + qk_quant_gran: str = "per_thread", + sm_scale: Optional[float] = None, + pv_accum_dtype: str = "fp32+fp32", + smooth_k: bool = True, + return_lse: bool = False, + **kwargs: Any, +) -> torch.Tensor: + """ + SageAttention with INT8 quantization for Q and K, FP8 PV with FP32 accumulation, implemented using CUDA. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + v : torch.Tensor + The value tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + is_causal : bool + Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len. + Default: False. + + qk_quant_gran : str + The granularity of quantization for Q and K, either "per_warp" or "per_thread". + Default: "per_thread". + + sm_scale : Optional[float] + The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``. + + pv_accum_dtype : str + The dtype of the accumulation of the product of the value tensor and the attention weights, either "fp32" or "fp32+fp32". + - "fp32": PV accumulation is done in fully in FP32. However, due to the hardware issue, there are only 22 valid bits in the FP32 accumulator. + - "fp32+fp32": PV accumulation is done in FP32 (actually FP22), but added to a FP32 buffer every few iterations. This offers a balance between speed and accuracy. + Default: "fp32+fp32". + + smooth_k : bool + Whether to smooth the key tensor by subtracting the mean along the sequence dimension. + Default: True. + + return_lse : bool + Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention. + Default: False. + + Returns + ------- + torch.Tensor + The output tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + torch.Tensor + The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor). + Shape: ``[batch_size, num_qo_heads, qo_len]``. + Only returned if `return_lse` is True. + + Note + ---- + - ``num_qo_heads`` must be divisible by ``num_kv_heads``. 
+ - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - All tensors must be on the same cuda device. + - `smooth_k` will introduce slight overhead but will improve the accuracy under most circumstances. + """ + + dtype = q.dtype + assert q.is_cuda, "Input tensors must be on cuda." + assert dtype in [torch.float16, torch.bfloat16], ( + "Input tensors must be in dtype of torch.float16 or torch.bfloat16" + ) + assert qk_quant_gran in ["per_warp", "per_thread"], ( + "qk_quant_gran must be either 'per_warp' or 'per_thread'." + ) + assert q.device == k.device == v.device, "All tensors must be on the same device." + assert q.dtype == k.dtype == v.dtype, "All tensors must have the same dtype." + + torch.cuda.set_device(v.device) + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + _is_caual = 1 if is_causal else 0 + _qk_quant_gran = 3 if qk_quant_gran == "per_thread" else 2 + _return_lse = 1 if return_lse else 0 + + head_dim_og = q.size(-1) + + if head_dim_og < 64: + q = torch.nn.functional.pad(q, (0, 64 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 64 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 64 - head_dim_og)) + elif head_dim_og > 64 and head_dim_og < 128: + q = torch.nn.functional.pad(q, (0, 128 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 128 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 128 - head_dim_og)) + elif head_dim_og > 128: + raise ValueError(f"Unsupported head_dim: {head_dim_og}") + + # assert last dim is contiguous + assert q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1, ( + "Last dim of qkv must be contiguous." + ) + + if sm_scale is None: + sm_scale = head_dim_og**-0.5 + + seq_dim = 1 if _tensor_layout == 0 else 2 + nh_dim = 2 if _tensor_layout == 0 else 1 + + if smooth_k: + km = k.mean(dim=seq_dim, keepdim=True) + nqheads = q.size(2) + nkheads = k.size(2) + q_per_kv_heads = nqheads // nkheads + if q_per_kv_heads > 1: + # nheads_k => nheads_q + km_broadcast = torch.repeat_interleave(km, q_per_kv_heads, dim=nh_dim) + else: + km_broadcast = km + if return_lse: + if tensor_layout == "NHD": + lse_correction = ( + torch.matmul( + q.transpose(1, 2), km_broadcast.transpose(1, 2).transpose(2, 3) + ) + .squeeze(-1) + .to(torch.float32) + ) + else: + lse_correction = ( + torch.matmul(q, km_broadcast.transpose(2, 3)) + .squeeze(-1) + .to(torch.float32) + ) + else: + km = None + + if qk_quant_gran == "per_warp": + q_int8, q_scale, k_int8, k_scale = per_warp_int8_cuda( + q, k, km, tensor_layout=tensor_layout, BLKQ=64, WARPQ=16, BLKK=128 + ) + elif qk_quant_gran == "per_thread": + q_int8, q_scale, k_int8, k_scale = per_thread_int8_triton( + q, + k, + km, + tensor_layout=tensor_layout, + BLKQ=64, + WARPQ=16, + BLKK=128, + WARPK=128, + ) + + o = torch.empty(q.size(), dtype=dtype, device=q.device) + + # pad v to multiple of 128 + # TODO: modify per_channel_fp8 kernel to handle this + kv_len = k.size(seq_dim) + v_pad_len = 128 - (kv_len % 128) if kv_len % 128 != 0 else 0 + if v_pad_len > 0: + if tensor_layout == "HND": + v = torch.cat( + [ + v, + torch.zeros( + v.size(0), + v.size(1), + v_pad_len, + v.size(3), + dtype=v.dtype, + device=v.device, + ), + ], + dim=2, + ) + else: + v = torch.cat( + [ + v, + torch.zeros( + v.size(0), + v_pad_len, + v.size(2), + v.size(3), + dtype=v.dtype, + device=v.device, + ), + ], + dim=1, + ) + + v_fp8, v_scale, _ = per_channel_fp8(v, tensor_layout=tensor_layout, smooth_v=False) + + if pv_accum_dtype == "fp32": + raise NotImplementedError("Please use 
pv_accum_dtype='fp32+fp32' for sm90.") + lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn( + q_int8, + k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + elif pv_accum_dtype == "fp32+fp32": + print( + "qint8", + q_int8.shape, + "qscale", + q_scale.shape, + "kint8", + k_int8.shape, + "kscale", + k_scale.shape, + "vfp8", + v_fp8.shape, + "vscale", + v_scale.shape, + ) + lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf_sm90( + q_int8, + k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + + o = o[..., :head_dim_og] + + if return_lse: + return ( + o, + lse / 1.44269504 + lse_correction * sm_scale + if smooth_k + else lse / 1.44269504, + ) + else: + return o diff --git a/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/layers.py b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/quant.py b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/quant.py new file mode 100644 index 0000000000000000000000000000000000000000..2e7a32c6a1e502bee12d0e0564ff2b90f6b00462 --- /dev/null +++ b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/quant.py @@ -0,0 +1,326 @@ +""" +Copyright (c) 2024 by SageAttention team. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import torch +from typing import Optional + +from ._ops import ops + + +def per_block_int8( + q: torch.Tensor, + k: torch.Tensor, + km: Optional[torch.Tensor] = None, + BLKQ: int = 128, + BLKK: int = 64, + sm_scale: Optional[float] = None, + tensor_layout: str = "HND", +): + """ + Quantize the query tensor `q` and the key tensor `k` with per block quantization. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + km : Optional[torch.Tensor] + The mean tensor of `k` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]``. + Should be of the same dtype as `k` if provided. Default is None. + + sm_scale : Optional[float] + The scale factor for the softmax operation. Default is ``head_dim**-0.5``. + It will be multiplied by ``1.44269504`` to work together with the triton attention kernel. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + Returns + ------- + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor] + A tuple containing: + - The quantized query tensor. Shape: Same as `q` but with `int8` dtype. 
+ - The scale tensor of the query tensor. Shape: ``[batch_size, num_qo_heads, (qo_len + BLKQ - 1) // BLKQ]`` with `float32` dtype. + - The quantized key tensor. Shape: Same as `k` but with `int8` dtype. + - The scale tensor of the key tensor. Shape: ``[batch_size, num_kv_heads, (kv_len + BLKK - 1) // BLKK]`` with `float32` dtype. + + Note + ---- + - The tensors `q` and `k` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + """ + + q_int8 = torch.empty(q.shape, dtype=torch.int8, device=q.device) + k_int8 = torch.empty(k.shape, dtype=torch.int8, device=k.device) + + if tensor_layout == "HND": + b, h_qo, qo_len, head_dim = q.shape + _, h_kv, kv_len, _ = k.shape + + elif tensor_layout == "NHD": + b, qo_len, h_qo, head_dim = q.shape + _, kv_len, h_kv, _ = k.shape + + else: + raise ValueError(f"Unknown tensor layout: {tensor_layout}") + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + + q_scale = torch.empty( + (b, h_qo, (qo_len + BLKQ - 1) // BLKQ), device=q.device, dtype=torch.float32 + ) + k_scale = torch.empty( + (b, h_kv, (kv_len + BLKK - 1) // BLKK), device=q.device, dtype=torch.float32 + ) + + if sm_scale is None: + sm_scale = head_dim**-0.5 + + sm_scale *= 1.44269504 + + ops.quant_per_block_int8_cuda(q, q_int8, q_scale, sm_scale, BLKQ, _tensor_layout) + if km is not None: + km = km.squeeze(1) if _tensor_layout == 0 else km.squeeze(2) + ops.quant_per_block_int8_fuse_sub_mean_cuda( + k, km, k_int8, k_scale, BLKK, _tensor_layout + ) + else: + # The bound CUDA op expects an sm_scale argument; use 1.0 for K to avoid scaling + ops.quant_per_block_int8_cuda(k, k_int8, k_scale, 1.0, BLKK, _tensor_layout) + + return q_int8, q_scale, k_int8, k_scale + + +def per_warp_int8( + q: torch.Tensor, + k: torch.Tensor, + km: Optional[torch.Tensor] = None, + BLKQ: int = 128, + WARPQ: int = 32, + BLKK: int = 64, + tensor_layout: str = "HND", +): + """ + Quantize the query tensor `q` with per warp quantization and the key tensor `k` with per block quantization. + Warp size of quantizing `q` is 16 or 32, with a block size of 64 or 128. + Block size of quantizing `k` is 64 or 128. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + km : Optional[torch.Tensor] + The mean tensor of `k` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]``. + Should be of the same dtype as `k` if provided. Default is None. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + Returns + ------- + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor] + A tuple containing: + - The quantized query tensor. Shape: Same as `q` but with `int8` dtype. + - The scale tensor of the query tensor. Shape: ``[batch_size, num_qo_heads, (qo_len + BLKQ - 1) // BLKQ * (BLKQ // WARPQ)]`` with `float32` dtype. + - The quantized key tensor. Shape: Same as `k` but with `int8` dtype. + - The scale tensor of the key tensor. Shape: ``[batch_size, num_kv_heads, (kv_len + BLKK - 1) // BLKK]`` with `float32` dtype. 
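+    Example
+    -------
+    A minimal sketch (shapes are illustrative; it assumes fp16 inputs on a CUDA
+    device with the bundled quantization kernels available):
+
+    >>> q = torch.randn(1, 8, 1024, 64, dtype=torch.float16, device="cuda")
+    >>> k = torch.randn(1, 8, 1024, 64, dtype=torch.float16, device="cuda")
+    >>> q_int8, q_scale, k_int8, k_scale = per_warp_int8(q, k, BLKQ=128, WARPQ=32, BLKK=64)
+    >>> q_scale.shape  # (b, h_qo, (qo_len + BLKQ - 1) // BLKQ * (BLKQ // WARPQ))
+    torch.Size([1, 8, 32])
+    >>> k_scale.shape  # (b, h_kv, (kv_len + BLKK - 1) // BLKK)
+    torch.Size([1, 8, 16])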
+ + Note + ---- + - The tensors `q` and `k` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + """ + + q_int8 = torch.empty(q.shape, dtype=torch.int8, device=q.device) + k_int8 = torch.empty(k.shape, dtype=torch.int8, device=k.device) + + if tensor_layout == "HND": + b, h_qo, qo_len, head_dim = q.shape + _, h_kv, kv_len, _ = k.shape + + elif tensor_layout == "NHD": + b, qo_len, h_qo, head_dim = q.shape + _, kv_len, h_kv, _ = k.shape + + else: + raise ValueError(f"Unknown tensor layout: {tensor_layout}") + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + + q_scale = torch.empty( + (b, h_qo, ((qo_len + BLKQ - 1) // BLKQ) * (BLKQ // WARPQ)), + device=q.device, + dtype=torch.float32, + ) + k_scale = torch.empty( + (b, h_kv, (kv_len + BLKK - 1) // BLKK), device=q.device, dtype=torch.float32 + ) + + ops.quant_per_warp_int8_cuda(q, q_int8, q_scale, BLKQ, WARPQ, _tensor_layout) + + if km is not None: + km = km.squeeze(1) if _tensor_layout == 0 else km.squeeze(2) + ops.quant_per_block_int8_fuse_sub_mean_cuda( + k, km, k_int8, k_scale, BLKK, _tensor_layout + ) + else: + # The bound CUDA op expects an sm_scale argument; use 1.0 for K to avoid scaling + ops.quant_per_block_int8_cuda(k, k_int8, k_scale, 1.0, BLKK, _tensor_layout) + + return q_int8, q_scale, k_int8, k_scale + + +def sub_mean(v: torch.Tensor, tensor_layout: str = "HND"): + """ + Calculate the mean of the tensor `v` along the sequence length dimension and subtract it from `v`. Result is stored as fp16. + + Parameters + ---------- + v : torch.Tensor + The input tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + Returns + ------- + Tuple[torch.Tensor, torch.Tensor] + A tuple containing: + - The tensor `v_smoothed` with the mean subtracted and stored as fp16. Shape: Same as `v` with `float16` dtype. + - The mean tensor of `v` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]`` with dtype same as `v`. + + Note + ---- + - The tensors `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - The returned tensor `v_smoothed` will have dtype ``torch.float16`` regardless of the input dtype. + - The returned mean tensor will have the same dtype as the input tensor. + """ + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + vm = v.mean(dim=1 if _tensor_layout == 0 else 2) + + v_smoothed = torch.empty(v.shape, dtype=torch.float16, device=v.device) + + # subtract mean and store the result as fp16 + ops.sub_mean_cuda(v, vm, v_smoothed, _tensor_layout) + + return v_smoothed, vm + + +def per_channel_fp8( + v: torch.Tensor, + tensor_layout: str = "HND", + scale_max: float = 448.0, + smooth_v: bool = True, +): + """ + Transpose, pad and permute the tensor `v` and quantize it to fp8 with per channel quantization. + `v` is first transposed along the head dimension and the sequence length dimension, then padded to a multiple of 64. + After that, the tensor is permuted along the sequence length dimension by ``[0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15]``. + The quantization is done per channel, with the scale value and smooth factor calculated per channel. + + Parameters + ---------- + v : torch.Tensor + The input tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. 
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + scale_max : float + The maximum scale value for the quantization. Default is 448.0 (upper bound of E4M3 data format). + + smooth_v : bool + Whether to smooth the quantized tensor. Default is True. + + Returns + ------- + Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] + A tuple containing: + - The quantized tensor `v_fp8`. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, head_dim, (kv_len + 63) // 64 * 64]``, with `float8_e4m3fn` dtype. + - If `tensor_layout` is "NHD": ``[batch_size, head_dim, num_kv_heads, (kv_len + 63) // 64 * 64]``, with `float8_e4m3fn` dtype. + - The scale tensor of `v`. Shape: ``[batch_size, num_kv_heads, head_dim]`` with `float32` dtype. + - The mean tensor of `v` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]`` with `float32` dtype. + + Note + ---- + - The tensors `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - The returned mean tensor will be None if `smooth_v` is False. Otherwise it will have dtype ``torch.float32``. + """ + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + + if tensor_layout == "HND": + b, h_kv, kv_len, head_dim = v.shape + padded_len = (kv_len + 63) // 64 * 64 + v_transposed_permutted = torch.empty( + (b, h_kv, head_dim, padded_len), dtype=v.dtype, device=v.device + ) + + elif tensor_layout == "NHD": + b, kv_len, h_kv, head_dim = v.shape + padded_len = (kv_len + 63) // 64 * 64 + v_transposed_permutted = torch.empty( + (b, head_dim, h_kv, padded_len), dtype=v.dtype, device=v.device + ) + + ops.transpose_pad_permute_cuda(v, v_transposed_permutted, _tensor_layout) + + v_fp8 = torch.empty( + v_transposed_permutted.shape, dtype=torch.float8_e4m3fn, device=v.device + ) + + v_scale = torch.empty((b, h_kv, head_dim), dtype=torch.float32, device=v.device) + vm = torch.empty((b, h_kv, head_dim), dtype=torch.float32, device=v.device) + + if smooth_v: + ops.mean_scale_fuse_quant_cuda( + v_transposed_permutted, + v_fp8, + vm, + v_scale, + kv_len, + scale_max, + _tensor_layout, + ) + return v_fp8, v_scale, vm + else: + ops.scale_fuse_quant_cuda( + v_transposed_permutted, v_fp8, v_scale, kv_len, scale_max, _tensor_layout + ) + return v_fp8, v_scale, None diff --git a/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/quant_per_thread.py b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/quant_per_thread.py new file mode 100644 index 0000000000000000000000000000000000000000..ab81f57c3e6dd9df89946ba54c4e2c3844c94d34 --- /dev/null +++ b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/quant_per_thread.py @@ -0,0 +1,204 @@ +""" +Copyright (c) 2024 by SageAttention team. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import torch +import triton +import triton.language as tl + +@triton.jit +def quant_query_per_thread_int8_kernel(Input, Output, Scale, L, + stride_iz, stride_ih, stride_in, + stride_oz, stride_oh, stride_on, + stride_sz, stride_sh, + C: tl.constexpr, BLK: tl.constexpr): + off_blk = tl.program_id(0) // 8 + off_tld = tl.program_id(0) % 8 + off_h = tl.program_id(1) + off_b = tl.program_id(2) + + offs_n = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld + offs_k = tl.arange(0, C) + + input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :] + output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :] + scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 8 + off_tld + + x = tl.load(input_ptrs, mask=offs_n[:, None] < L) + x = x.to(tl.float32) + scale = tl.max(tl.abs(x)) / 127. + 0.0000001 + x_int8 = x / scale + x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1) + x_int8 = x_int8.to(tl.int8) + tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L) + tl.store(scale_ptrs, scale) + +@triton.jit +def quant_key_per_thread_int8_kernel(Input, Output, Scale, L, + stride_iz, stride_ih, stride_in, + stride_oz, stride_oh, stride_on, + stride_sz, stride_sh, + C: tl.constexpr, BLK: tl.constexpr): + off_blk = tl.program_id(0) // 4 + off_tld = tl.program_id(0) % 4 + off_h = tl.program_id(1) + off_b = tl.program_id(2) + + # offs_n = off_blk * BLK + tl.cat(tl.arange(0, BLK // 8) * 8, tl.arange(0, BLK // 8) * 8 + 1, True) + off_tld * 2 + # offs_k = tl.arange(0, C) + + # input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :] + # output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :] + # scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 4 + off_tld + + # x = tl.load(input_ptrs, mask=offs_n[:, None] < L) + # x = x.to(tl.float32) + # scale = tl.max(tl.abs(x)) / 127. + 0.0000001 + # x_int8 = x / scale + # x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1) + # x_int8 = x_int8.to(tl.int8) + # tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L) + # tl.store(scale_ptrs, scale) + + offs_n0 = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld * 2 + offs_n1 = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld * 2 + 1 + offs_k = tl.arange(0, C) + + input_ptrs0 = Input + off_b * stride_iz + off_h * stride_ih + offs_n0[:, None] * stride_in + offs_k[None, :] + input_ptrs1 = Input + off_b * stride_iz + off_h * stride_ih + offs_n1[:, None] * stride_in + offs_k[None, :] + output_ptrs0 = Output + off_b * stride_oz + off_h * stride_oh + offs_n0[:, None] * stride_on + offs_k[None, :] + output_ptrs1 = Output + off_b * stride_oz + off_h * stride_oh + offs_n1[:, None] * stride_on + offs_k[None, :] + scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 4 + off_tld + + x0 = tl.load(input_ptrs0, mask=offs_n0[:, None] < L) + x1 = tl.load(input_ptrs1, mask=offs_n1[:, None] < L) + x0 = x0.to(tl.float32) + x1 = x1.to(tl.float32) + scale = max(tl.max(tl.abs(x0)), tl.max(tl.abs(x1))) / 127. 
+ 0.0000001 + x0_int8 = x0 / scale + x1_int8 = x1 / scale + x0_int8 += 0.5 * tl.where(x0_int8 >= 0, 1, -1) + x1_int8 += 0.5 * tl.where(x1_int8 >= 0, 1, -1) + x0_int8 = x0_int8.to(tl.int8) + x1_int8 = x1_int8.to(tl.int8) + tl.store(output_ptrs0, x0_int8, mask=offs_n0[:, None] < L) + tl.store(output_ptrs1, x1_int8, mask=offs_n1[:, None] < L) + tl.store(scale_ptrs, scale) + +@triton.jit +def quant_query_per_thread_int4_kernel(Input, Output, Scale, L, + stride_iz, stride_ih, stride_in, + stride_oz, stride_oh, stride_on, + stride_sz, stride_sh, + C: tl.constexpr, BLK: tl.constexpr): + off_blk = tl.program_id(0) // 8 + off_tld = tl.program_id(0) % 8 + off_h = tl.program_id(1) + off_b = tl.program_id(2) + + offs_n = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld + offs_k = tl.arange(0, C) + + input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :] + output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :] + scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 8 + off_tld + + x = tl.load(input_ptrs, mask=offs_n[:, None] < L) + x = x.to(tl.float32) + scale = tl.max(tl.abs(x)) / 7. + 0.0000001 + x_int8 = x / scale + x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1) + x_int8 = x_int8.to(tl.int8) + tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L) + tl.store(scale_ptrs, scale) + +@triton.jit +def quant_key_per_thread_int4_kernel(Input, Output, Scale, L, + stride_iz, stride_ih, stride_in, + stride_oz, stride_oh, stride_on, + stride_sz, stride_sh, + C: tl.constexpr, BLK: tl.constexpr): + off_blk = tl.program_id(0) // 4 + off_tld = tl.program_id(0) % 4 + off_h = tl.program_id(1) + off_b = tl.program_id(2) + + offs_n = off_blk * BLK + tl.cat(tl.arange(0, BLK // 8) * 8, tl.arange(0, BLK // 8) * 8 + 1, True) + off_tld * 2 + offs_k = tl.arange(0, C) + + input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :] + output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :] + scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 4 + off_tld + + x = tl.load(input_ptrs, mask=offs_n[:, None] < L) + x = x.to(tl.float32) + scale = tl.max(tl.abs(x)) / 7. 
+ 0.0000001 + x_int8 = x / scale + x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1) + x_int8 = x_int8.to(tl.int8) + tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L) + tl.store(scale_ptrs, scale) + +def per_thread_int8(q, k, km=None, BLKQ=128, WARPQ=32, BLKK=64, WARPK=64, sm_scale=None, tensor_layout="HND"): + q_int8 = torch.empty(q.shape, dtype=torch.int8, device=q.device) + k_int8 = torch.empty(k.shape, dtype=torch.int8, device=k.device) + + if km is not None: + k = k - km + + if tensor_layout == "HND": + b, h_qo, qo_len, head_dim = q.shape + _, h_kv, kv_len, _ = k.shape + + stride_bz_q, stride_h_q, stride_seq_q = q.stride(0), q.stride(1), q.stride(2) + stride_bz_qo, stride_h_qo, stride_seq_qo = q_int8.stride(0), q_int8.stride(1), q_int8.stride(2) + stride_bz_k, stride_h_k, stride_seq_k = k.stride(0), k.stride(1), k.stride(2) + stride_bz_ko, stride_h_ko, stride_seq_ko = k_int8.stride(0), k_int8.stride(1), k_int8.stride(2) + elif tensor_layout == "NHD": + b, qo_len, h_qo, head_dim = q.shape + _, kv_len, h_kv, _ = k.shape + + stride_bz_q, stride_h_q, stride_seq_q = q.stride(0), q.stride(2), q.stride(1) + stride_bz_qo, stride_h_qo, stride_seq_qo = q_int8.stride(0), q_int8.stride(2), q_int8.stride(1) + stride_bz_k, stride_h_k, stride_seq_k = k.stride(0), k.stride(2), k.stride(1) + stride_bz_ko, stride_h_ko, stride_seq_ko = k_int8.stride(0), k_int8.stride(2), k_int8.stride(1) + else: + raise ValueError(f"Unknown tensor layout: {tensor_layout}") + + q_scale = torch.empty((b, h_qo, (qo_len + BLKQ - 1) // BLKQ * (BLKQ // WARPQ) * 8), device=q.device, dtype=torch.float32) + k_scale = torch.empty((b, h_kv, (kv_len + BLKK - 1) // BLKK * (BLKK // WARPK) * 4), device=q.device, dtype=torch.float32) + + if sm_scale is None: + sm_scale = head_dim**-0.5 + + grid = ((qo_len + BLKQ - 1) // BLKQ * (BLKQ // WARPQ) * 8, h_qo, b) + quant_query_per_thread_int8_kernel[grid]( + q, q_int8, q_scale, qo_len, + stride_bz_q, stride_h_q, stride_seq_q, + stride_bz_qo, stride_h_qo, stride_seq_qo, + q_scale.stride(0), q_scale.stride(1), + C=head_dim, BLK=WARPQ + ) + + grid = ((kv_len + BLKK - 1) // BLKK * (BLKK // WARPK) * 4, h_kv, b) + quant_key_per_thread_int8_kernel[grid]( + k, k_int8, k_scale, kv_len, + stride_bz_k, stride_h_k, stride_seq_k, + stride_bz_ko, stride_h_ko, stride_seq_ko, + k_scale.stride(0), k_scale.stride(1), + C=head_dim, BLK=WARPK + ) + + return q_int8, q_scale, k_int8, k_scale \ No newline at end of file diff --git a/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__init__.py b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5df69e92664edfbd0820d5126792e41e23a72762 --- /dev/null +++ b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__init__.py @@ -0,0 +1,12 @@ +from .quant import per_block_int8, per_warp_int8, sub_mean, per_channel_fp8 +from .core import sageattn, sageattn_qk_int8_pv_fp8_cuda + + +__all__ = [ + "per_block_int8", + "per_warp_int8", + "sub_mean", + "per_channel_fp8", + "sageattn", + "sageattn_qk_int8_pv_fp8_cuda", +] \ No newline at end of file diff --git a/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1dfb4a20911742e46041373d8bd10f01e6a83afa Binary files /dev/null and b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc differ diff --git 
a/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9d6048f0206f7d3fe5f01106837b0145a66b1df7 Binary files /dev/null and b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..674578609f75fb9d129c302f5e2efd5e9dd88869 Binary files /dev/null and b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2b1288e6a808145a21813915428772c5151fffe9 Binary files /dev/null and b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7d7155cabca1497530d3d266f4ec3b647f995f95 Binary files /dev/null and b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/_ops.py b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..7dc56aab172e24e052a9f0c78060ecfbdff00309 --- /dev/null +++ b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _sage_attention_44b112f_dirty +ops = torch.ops._sage_attention_44b112f_dirty + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_sage_attention_44b112f_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/_sage_attention_44b112f_dirty.abi3.so b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/_sage_attention_44b112f_dirty.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..736613dcbbc913f7a1c538e9d383e65f98fe5f52 --- /dev/null +++ b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/_sage_attention_44b112f_dirty.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:824faeacc05dc7d676acaa9a005d5f4d7e62f47c361eb58a085f020e21fde29e +size 26612144 diff --git a/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/core.py b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/core.py new file mode 100644 index 0000000000000000000000000000000000000000..dc44a8e1ee17a5c5c65da5adda6faf9228cca55e --- /dev/null +++ b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/core.py @@ -0,0 +1,983 @@ +""" +Copyright (c) 2024 by SageAttention team. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import torch +import torch.nn.functional as F + +from ._ops import ops + + +from .quant import per_warp_int8 as per_warp_int8_cuda +from .quant import sub_mean +from .quant import per_channel_fp8 +from .quant_per_thread import per_thread_int8 as per_thread_int8_triton + +from typing import Any, List, Literal, Optional, Tuple, Union +import warnings + + +import subprocess +import re + + +def get_cuda_version(): + try: + output = subprocess.check_output(["nvcc", "--version"]).decode() + match = re.search(r"release (\d+)\.(\d+)", output) + if match: + major, minor = int(match.group(1)), int(match.group(2)) + return major, minor + except Exception as e: + print("Failed to get CUDA version:", e) + return None, None + + +def get_cuda_arch_versions(): + cuda_archs = [] + for i in range(torch.cuda.device_count()): + major, minor = torch.cuda.get_device_capability(i) + cuda_archs.append(f"sm{major}{minor}") + return cuda_archs + + +def sageattn( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + tensor_layout: str = "HND", + is_causal: bool = False, + sm_scale: Optional[float] = None, + return_lse: bool = False, + **kwargs: Any, +): + """ + Automatically selects the appropriate implementation of the SageAttention kernel based on the GPU compute capability. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + v : torch.Tensor + The value tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + is_causal : bool + Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len. + Default: False. + + sm_scale : Optional[float] + The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``. + + return_lse : bool + Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention. + Default: False. + + Returns + ------- + torch.Tensor + The output tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + torch.Tensor + The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor). + Shape: ``[batch_size, num_qo_heads, qo_len]``. + Only returned if `return_lse` is True. + + Note + ---- + - ``num_qo_heads`` must be divisible by ``num_kv_heads``. + - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - All tensors must be on the same cuda device. 
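+    Example
+    -------
+    A minimal sketch of the "NHD" layout together with the logsumexp output
+    (shapes are illustrative; it assumes fp16 inputs on a supported CUDA device):
+
+    >>> q = torch.randn(2, 4096, 16, 128, dtype=torch.float16, device="cuda")
+    >>> k = torch.randn(2, 4096, 16, 128, dtype=torch.float16, device="cuda")
+    >>> v = torch.randn(2, 4096, 16, 128, dtype=torch.float16, device="cuda")
+    >>> o, lse = sageattn(q, k, v, tensor_layout="NHD", return_lse=True)
+    >>> o.shape, lse.shape
+    (torch.Size([2, 4096, 16, 128]), torch.Size([2, 16, 4096]))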
+ """ + + arch = get_cuda_arch_versions()[q.device.index] + if arch == "sm80": + return sageattn_qk_int8_pv_fp16_cuda( + q, + k, + v, + tensor_layout=tensor_layout, + is_causal=is_causal, + sm_scale=sm_scale, + return_lse=return_lse, + pv_accum_dtype="fp32", + ) + elif arch == "sm89": + return sageattn_qk_int8_pv_fp8_cuda( + q, + k, + v, + tensor_layout=tensor_layout, + is_causal=is_causal, + sm_scale=sm_scale, + return_lse=return_lse, + pv_accum_dtype="fp32+fp16", + ) + elif arch == "sm90": + return sageattn_qk_int8_pv_fp8_cuda_sm90( + q, + k, + v, + tensor_layout=tensor_layout, + is_causal=is_causal, + sm_scale=sm_scale, + return_lse=return_lse, + pv_accum_dtype="fp32+fp32", + ) + elif arch == "sm120": + return sageattn_qk_int8_pv_fp8_cuda( + q, + k, + v, + tensor_layout=tensor_layout, + is_causal=is_causal, + qk_quant_gran="per_warp", + sm_scale=sm_scale, + return_lse=return_lse, + pv_accum_dtype="fp32+fp16", + ) # sm120 has accurate fp32 accumulator for fp8 mma and triton kernel is currently not usable on sm120. + else: + raise ValueError(f"Unsupported CUDA architecture: {arch}") + + +@torch.compiler.disable +def sageattn_qk_int8_pv_fp16_cuda( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + tensor_layout: str = "HND", + is_causal: bool = False, + qk_quant_gran: str = "per_thread", + sm_scale: Optional[float] = None, + pv_accum_dtype: str = "fp32", + smooth_k: bool = True, + smooth_v: bool = False, + return_lse: bool = False, + **kwargs: Any, +) -> torch.Tensor: + """ + SageAttention with INT8 quantization for Q and K, FP16 PV with FP16/FP32 accumulation, implemented using CUDA. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + v : torch.Tensor + The value tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + is_causal : bool + Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len. + Default: False. + + qk_quant_gran : str + The granularity of quantization for Q and K, either "per_warp" or "per_thread". + Default: "per_thread". + + sm_scale : Optional[float] + The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``. + + pv_accum_dtype : str + The dtype of the accumulation of the product of the value tensor and the attention weights, either "fp16", "fp16+fp32" or "fp32". + - "fp16": PV accumulation is done in fully in FP16. This is the fastest option but may lead to numerical instability. `smooth_v` option will increase the accuracy in cases when the value tensor has a large bias (like in CogVideoX-2b). + - "fp32": PV accumulation is done in FP32. This is the most accurate option but may be slower than "fp16" due to CUDA core overhead. + - "fp16+fp32": PV accumulation is done in FP16, but added to a FP32 buffer every few iterations. This offers a balance between speed and accuracy. + Default: "fp32". 
+ + smooth_k : bool + Whether to smooth the key tensor by subtracting the mean along the sequence dimension. + Default: True. + + smooth_v : bool + Whether to smooth the value tensor by subtracting the mean along the sequence dimension. + smooth_v will be ignored if pv_accum_dtype is "fp32" or "fp16+fp32". + Default: False. + + return_lse : bool + Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention. + Default: False. + + Returns + ------- + torch.Tensor + The output tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + torch.Tensor + The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor). + Shape: ``[batch_size, num_qo_heads, qo_len]``. + Only returned if `return_lse` is True. + + Note + ---- + - ``num_qo_heads`` must be divisible by ``num_kv_heads``. + - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - All tensors must be on the same cuda device. + - `smooth_k` will introduce slight overhead but will improve the accuracy under most circumstances. + """ + + dtype = q.dtype + assert q.is_cuda, "Input tensors must be on cuda." + assert dtype in [torch.float16, torch.bfloat16], ( + "Input tensors must be in dtype of torch.float16 or torch.bfloat16" + ) + assert qk_quant_gran in ["per_warp", "per_thread"], ( + "qk_quant_gran must be either 'per_warp' or 'per_thread'." + ) + assert q.device == k.device == v.device, "All tensors must be on the same device." + assert q.dtype == k.dtype == v.dtype, "All tensors must have the same dtype." + + # FIXME(DefTruth): make sage attention work compatible with distributed + # env, for example, xDiT which launch by torchrun. Without this workaround, + # sage attention will run into illegal memory access error after first + # inference step in distributed env for multi gpus inference. This small + # workaround also make sage attention work compatible with torch.compile + # through non-fullgraph compile mode. + torch.cuda.set_device(v.device) + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + _is_caual = 1 if is_causal else 0 + _qk_quant_gran = 3 if qk_quant_gran == "per_thread" else 2 + _return_lse = 1 if return_lse else 0 + + head_dim_og = q.size(-1) + + if head_dim_og < 64: + q = torch.nn.functional.pad(q, (0, 64 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 64 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 64 - head_dim_og)) + elif head_dim_og > 64 and head_dim_og < 128: + q = torch.nn.functional.pad(q, (0, 128 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 128 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 128 - head_dim_og)) + elif head_dim_og > 128: + raise ValueError(f"Unsupported head_dim: {head_dim_og}") + + # assert last dim is contiguous + assert q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1, ( + "Last dim of qkv must be contiguous." 
+ ) + + if sm_scale is None: + sm_scale = head_dim_og**-0.5 + + seq_dim = 1 if _tensor_layout == 0 else 2 + nh_dim = 2 if _tensor_layout == 0 else 1 + + if smooth_k: + km = k.mean(dim=seq_dim, keepdim=True) + nqheads = q.size(2) + nkheads = k.size(2) + q_per_kv_heads = nqheads // nkheads + if q_per_kv_heads > 1: + # nheads_k => nheads_q + km_broadcast = torch.repeat_interleave(km, q_per_kv_heads, dim=nh_dim) + else: + km_broadcast = km + if return_lse: + if tensor_layout == "NHD": + lse_correction = ( + torch.matmul( + q.transpose(1, 2), km_broadcast.transpose(1, 2).transpose(2, 3) + ) + .squeeze(-1) + .to(torch.float32) + ) + else: + lse_correction = ( + torch.matmul(q, km_broadcast.transpose(2, 3)) + .squeeze(-1) + .to(torch.float32) + ) + else: + km = None + + if qk_quant_gran == "per_warp": + q_int8, q_scale, k_int8, k_scale = per_warp_int8_cuda( + q, + k, + km, + tensor_layout=tensor_layout, + BLKQ=128, + WARPQ=(16 if (q.size(-1) == 128 and pv_accum_dtype == "fp16+fp32") else 32), + BLKK=64, + ) + elif qk_quant_gran == "per_thread": + q_int8, q_scale, k_int8, k_scale = per_thread_int8_triton( + q, + k, + km, + tensor_layout=tensor_layout, + BLKQ=128, + WARPQ=(16 if (q.size(-1) == 128 and pv_accum_dtype == "fp16+fp32") else 32), + BLKK=64, + WARPK=64, + ) + + o = torch.empty(q.size(), dtype=dtype, device=q.device) + + if pv_accum_dtype in ["fp32", "fp16+fp32"] and smooth_v: + warnings.warn(f"pv_accum_dtype is {pv_accum_dtype}, smooth_v will be ignored.") + smooth_v = False + + if pv_accum_dtype == "fp32": + v = v.to(torch.float16) + lse = _qattn_sm80.qk_int8_sv_f16_accum_f32_attn( + q_int8, + k_int8, + v, + o, + q_scale, + k_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + elif pv_accum_dtype == "fp16": + if smooth_v: + smoothed_v, vm = sub_mean(v, tensor_layout=tensor_layout) + lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_fuse_v_mean_attn( + q_int8, + k_int8, + smoothed_v, + o, + q_scale, + k_scale, + vm, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + else: + v = v.to(torch.float16) + lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_attn( + q_int8, + k_int8, + v, + o, + q_scale, + k_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + elif pv_accum_dtype == "fp16+fp32": + v = v.to(torch.float16) + lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_attn_inst_buf( + q_int8, + k_int8, + v, + o, + q_scale, + k_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + else: + raise ValueError(f"Unsupported pv_accum_dtype: {pv_accum_dtype}") + + o = o[..., :head_dim_og] + + if return_lse: + return ( + o, + lse / 1.44269504 + lse_correction * sm_scale + if smooth_k + else lse / 1.44269504, + ) + else: + return o + + +@torch.compiler.disable +def sageattn_qk_int8_pv_fp8_cuda( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + tensor_layout: str = "HND", + is_causal: bool = False, + qk_quant_gran: str = "per_thread", + sm_scale: Optional[float] = None, + pv_accum_dtype: str = "fp32+fp16", + smooth_k: bool = True, + smooth_v: bool = False, + return_lse: bool = False, + **kwargs: Any, +) -> torch.Tensor: + """ + SageAttention with INT8 quantization for Q and K, FP8 PV with FP32 accumulation, implemented using CUDA. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. 
+ + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + v : torch.Tensor + The value tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + is_causal : bool + Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len. + Default: False. + + qk_quant_gran : str + The granularity of quantization for Q and K, either "per_warp" or "per_thread". + Default: "per_thread". + + sm_scale : Optional[float] + The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``. + + pv_accum_dtype : str + The dtype of the accumulation of the product of the value tensor and the attention weights, either "fp32" or "fp32+fp32". + - "fp32": PV accumulation is done in fully in FP32. However, due to the hardware issue, there are only 22 valid bits in the FP32 accumulator. + - "fp32+fp32": PV accumulation is done in FP32 (actually FP22), but added to a FP32 buffer every few iterations. This offers a balance between speed and accuracy. + Default: "fp32+fp32". + + smooth_k : bool + Whether to smooth the key tensor by subtracting the mean along the sequence dimension. + Default: True. + + smooth_v : bool + Whether to smooth the value tensor by subtracting the mean along the sequence dimension. + smooth_v will be ignored if pv_accum_dtype is "fp32+fp32". + Default: False. + + return_lse : bool + Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention. + Default: False. + + Returns + ------- + torch.Tensor + The output tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + torch.Tensor + The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor). + Shape: ``[batch_size, num_qo_heads, qo_len]``. + Only returned if `return_lse` is True. + + Note + ---- + - ``num_qo_heads`` must be divisible by ``num_kv_heads``. + - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - All tensors must be on the same cuda device. + - `smooth_k` will introduce slight overhead but will improve the accuracy under most circumstances. + """ + + dtype = q.dtype + assert q.is_cuda, "Input tensors must be on cuda." + assert dtype in [torch.float16, torch.bfloat16], ( + "Input tensors must be in dtype of torch.float16 or torch.bfloat16" + ) + assert qk_quant_gran in ["per_warp", "per_thread"], ( + "qk_quant_gran must be either 'per_warp' or 'per_thread'." + ) + assert q.device == k.device == v.device, "All tensors must be on the same device." + assert q.dtype == k.dtype == v.dtype, "All tensors must have the same dtype." + + # cuda_major_version, cuda_minor_version = get_cuda_version() + # if(cuda_major_version, cuda_minor_version) < (12, 8) and pv_accum_dtype == 'fp32+fp16': + # warnings.warn("cuda version < 12.8, change pv_accum_dtype to 'fp32+fp32'") + # pv_accum_dtype = 'fp32+fp32' + + # FIXME(DefTruth): make sage attention work compatible with distributed + # env, for example, xDiT which launch by torchrun. 
Without this workaround, + # sage attention will run into illegal memory access error after first + # inference step in distributed env for multi gpus inference. This small + # workaround also make sage attention work compatible with torch.compile + # through non-fullgraph compile mode. + torch.cuda.set_device(v.device) + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + _is_caual = 1 if is_causal else 0 + _qk_quant_gran = 3 if qk_quant_gran == "per_thread" else 2 + _return_lse = 1 if return_lse else 0 + + head_dim_og = q.size(-1) + + if head_dim_og < 64: + q = torch.nn.functional.pad(q, (0, 64 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 64 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 64 - head_dim_og)) + elif head_dim_og > 64 and head_dim_og < 128: + q = torch.nn.functional.pad(q, (0, 128 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 128 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 128 - head_dim_og)) + elif head_dim_og > 128: + raise ValueError(f"Unsupported head_dim: {head_dim_og}") + + # assert last dim is contiguous + assert q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1, ( + "Last dim of qkv must be contiguous." + ) + + if sm_scale is None: + sm_scale = head_dim_og**-0.5 + + seq_dim = 1 if _tensor_layout == 0 else 2 + nh_dim = 2 if _tensor_layout == 0 else 1 + + if smooth_k: + km = k.mean(dim=seq_dim, keepdim=True) + nqheads = q.size(2) + nkheads = k.size(2) + q_per_kv_heads = nqheads // nkheads + if q_per_kv_heads > 1: + # nheads_k => nheads_q + km_broadcast = torch.repeat_interleave(km, q_per_kv_heads, dim=nh_dim) + else: + km_broadcast = km + if return_lse: + if tensor_layout == "NHD": + lse_correction = ( + torch.matmul( + q.transpose(1, 2), km_broadcast.transpose(1, 2).transpose(2, 3) + ) + .squeeze(-1) + .to(torch.float32) + ) + else: + lse_correction = ( + torch.matmul(q, km_broadcast.transpose(2, 3)) + .squeeze(-1) + .to(torch.float32) + ) + else: + km = None + + if qk_quant_gran == "per_warp": + q_int8, q_scale, k_int8, k_scale = per_warp_int8_cuda( + q, k, km, tensor_layout=tensor_layout, BLKQ=128, WARPQ=32, BLKK=64 + ) + elif qk_quant_gran == "per_thread": + q_int8, q_scale, k_int8, k_scale = per_thread_int8_triton( + q, k, km, tensor_layout=tensor_layout, BLKQ=128, WARPQ=32, BLKK=64, WARPK=64 + ) + + o = torch.empty(q.size(), dtype=dtype, device=q.device) + + if pv_accum_dtype == "fp32+fp32" and smooth_v: + warnings.warn("pv_accum_dtype is 'fp32+fp32', smooth_v will be ignored.") + smooth_v = False + + if pv_accum_dtype == "fp32+fp16" and smooth_v: + warnings.warn("pv_accum_dtype is 'fp32+fp16', smooth_v will be ignored.") + smooth_v = False + + quant_v_scale_max = 448.0 + if pv_accum_dtype == "fp32+fp16": + quant_v_scale_max = 2.25 + + v_fp8, v_scale, vm = per_channel_fp8( + v, tensor_layout=tensor_layout, scale_max=quant_v_scale_max, smooth_v=smooth_v + ) + print("before kernel call") + if pv_accum_dtype == "fp32": + if smooth_v: + lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_fuse_v_mean_attn( + q_int8, + k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + vm, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + torch.cuda.synchronize() + else: + lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn( + q_int8, + k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + torch.cuda.synchronize() + elif pv_accum_dtype == "fp32+fp32": + lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf( + q_int8, + 
k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + torch.cuda.synchronize() + elif pv_accum_dtype == "fp32+fp16": + lse = ops.qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf( + q_int8, + k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + torch.cuda.synchronize() + o = o[..., :head_dim_og] + print("after kernel call") + if return_lse: + return ( + o, + lse / 1.44269504 + lse_correction * sm_scale + if smooth_k + else lse / 1.44269504, + ) + else: + return o + + +@torch.compiler.disable +def sageattn_qk_int8_pv_fp8_cuda_sm90( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + tensor_layout: str = "HND", + is_causal: bool = False, + qk_quant_gran: str = "per_thread", + sm_scale: Optional[float] = None, + pv_accum_dtype: str = "fp32+fp32", + smooth_k: bool = True, + return_lse: bool = False, + **kwargs: Any, +) -> torch.Tensor: + """ + SageAttention with INT8 quantization for Q and K, FP8 PV with FP32 accumulation, implemented using CUDA. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + v : torch.Tensor + The value tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + is_causal : bool + Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len. + Default: False. + + qk_quant_gran : str + The granularity of quantization for Q and K, either "per_warp" or "per_thread". + Default: "per_thread". + + sm_scale : Optional[float] + The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``. + + pv_accum_dtype : str + The dtype of the accumulation of the product of the value tensor and the attention weights, either "fp32" or "fp32+fp32". + - "fp32": PV accumulation is done in fully in FP32. However, due to the hardware issue, there are only 22 valid bits in the FP32 accumulator. + - "fp32+fp32": PV accumulation is done in FP32 (actually FP22), but added to a FP32 buffer every few iterations. This offers a balance between speed and accuracy. + Default: "fp32+fp32". + + smooth_k : bool + Whether to smooth the key tensor by subtracting the mean along the sequence dimension. + Default: True. + + return_lse : bool + Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention. + Default: False. + + Returns + ------- + torch.Tensor + The output tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + torch.Tensor + The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor). + Shape: ``[batch_size, num_qo_heads, qo_len]``. + Only returned if `return_lse` is True. + + Note + ---- + - ``num_qo_heads`` must be divisible by ``num_kv_heads``. 
+ - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - All tensors must be on the same cuda device. + - `smooth_k` will introduce slight overhead but will improve the accuracy under most circumstances. + """ + + dtype = q.dtype + assert q.is_cuda, "Input tensors must be on cuda." + assert dtype in [torch.float16, torch.bfloat16], ( + "Input tensors must be in dtype of torch.float16 or torch.bfloat16" + ) + assert qk_quant_gran in ["per_warp", "per_thread"], ( + "qk_quant_gran must be either 'per_warp' or 'per_thread'." + ) + assert q.device == k.device == v.device, "All tensors must be on the same device." + assert q.dtype == k.dtype == v.dtype, "All tensors must have the same dtype." + + torch.cuda.set_device(v.device) + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + _is_caual = 1 if is_causal else 0 + _qk_quant_gran = 3 if qk_quant_gran == "per_thread" else 2 + _return_lse = 1 if return_lse else 0 + + head_dim_og = q.size(-1) + + if head_dim_og < 64: + q = torch.nn.functional.pad(q, (0, 64 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 64 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 64 - head_dim_og)) + elif head_dim_og > 64 and head_dim_og < 128: + q = torch.nn.functional.pad(q, (0, 128 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 128 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 128 - head_dim_og)) + elif head_dim_og > 128: + raise ValueError(f"Unsupported head_dim: {head_dim_og}") + + # assert last dim is contiguous + assert q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1, ( + "Last dim of qkv must be contiguous." + ) + + if sm_scale is None: + sm_scale = head_dim_og**-0.5 + + seq_dim = 1 if _tensor_layout == 0 else 2 + nh_dim = 2 if _tensor_layout == 0 else 1 + + if smooth_k: + km = k.mean(dim=seq_dim, keepdim=True) + nqheads = q.size(2) + nkheads = k.size(2) + q_per_kv_heads = nqheads // nkheads + if q_per_kv_heads > 1: + # nheads_k => nheads_q + km_broadcast = torch.repeat_interleave(km, q_per_kv_heads, dim=nh_dim) + else: + km_broadcast = km + if return_lse: + if tensor_layout == "NHD": + lse_correction = ( + torch.matmul( + q.transpose(1, 2), km_broadcast.transpose(1, 2).transpose(2, 3) + ) + .squeeze(-1) + .to(torch.float32) + ) + else: + lse_correction = ( + torch.matmul(q, km_broadcast.transpose(2, 3)) + .squeeze(-1) + .to(torch.float32) + ) + else: + km = None + + if qk_quant_gran == "per_warp": + q_int8, q_scale, k_int8, k_scale = per_warp_int8_cuda( + q, k, km, tensor_layout=tensor_layout, BLKQ=64, WARPQ=16, BLKK=128 + ) + elif qk_quant_gran == "per_thread": + q_int8, q_scale, k_int8, k_scale = per_thread_int8_triton( + q, + k, + km, + tensor_layout=tensor_layout, + BLKQ=64, + WARPQ=16, + BLKK=128, + WARPK=128, + ) + + o = torch.empty(q.size(), dtype=dtype, device=q.device) + + # pad v to multiple of 128 + # TODO: modify per_channel_fp8 kernel to handle this + kv_len = k.size(seq_dim) + v_pad_len = 128 - (kv_len % 128) if kv_len % 128 != 0 else 0 + if v_pad_len > 0: + if tensor_layout == "HND": + v = torch.cat( + [ + v, + torch.zeros( + v.size(0), + v.size(1), + v_pad_len, + v.size(3), + dtype=v.dtype, + device=v.device, + ), + ], + dim=2, + ) + else: + v = torch.cat( + [ + v, + torch.zeros( + v.size(0), + v_pad_len, + v.size(2), + v.size(3), + dtype=v.dtype, + device=v.device, + ), + ], + dim=1, + ) + + v_fp8, v_scale, _ = per_channel_fp8(v, tensor_layout=tensor_layout, smooth_v=False) + + if pv_accum_dtype == "fp32": + raise NotImplementedError("Please use 
pv_accum_dtype='fp32+fp32' for sm90.") + lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn( + q_int8, + k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + elif pv_accum_dtype == "fp32+fp32": + print( + "qint8", + q_int8.shape, + "qscale", + q_scale.shape, + "kint8", + k_int8.shape, + "kscale", + k_scale.shape, + "vfp8", + v_fp8.shape, + "vscale", + v_scale.shape, + ) + lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf_sm90( + q_int8, + k_int8, + v_fp8, + o, + q_scale, + k_scale, + v_scale, + _tensor_layout, + _is_caual, + _qk_quant_gran, + sm_scale, + _return_lse, + ) + + o = o[..., :head_dim_og] + + if return_lse: + return ( + o, + lse / 1.44269504 + lse_correction * sm_scale + if smooth_k + else lse / 1.44269504, + ) + else: + return o diff --git a/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/layers.py b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/quant.py b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/quant.py new file mode 100644 index 0000000000000000000000000000000000000000..2e7a32c6a1e502bee12d0e0564ff2b90f6b00462 --- /dev/null +++ b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/quant.py @@ -0,0 +1,326 @@ +""" +Copyright (c) 2024 by SageAttention team. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import torch +from typing import Optional + +from ._ops import ops + + +def per_block_int8( + q: torch.Tensor, + k: torch.Tensor, + km: Optional[torch.Tensor] = None, + BLKQ: int = 128, + BLKK: int = 64, + sm_scale: Optional[float] = None, + tensor_layout: str = "HND", +): + """ + Quantize the query tensor `q` and the key tensor `k` with per block quantization. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + km : Optional[torch.Tensor] + The mean tensor of `k` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]``. + Should be of the same dtype as `k` if provided. Default is None. + + sm_scale : Optional[float] + The scale factor for the softmax operation. Default is ``head_dim**-0.5``. + It will be multiplied by ``1.44269504`` to work together with the triton attention kernel. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + Returns + ------- + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor] + A tuple containing: + - The quantized query tensor. Shape: Same as `q` but with `int8` dtype. 
+ - The scale tensor of the query tensor. Shape: ``[batch_size, num_qo_heads, (qo_len + BLKQ - 1) // BLKQ]`` with `float32` dtype. + - The quantized key tensor. Shape: Same as `k` but with `int8` dtype. + - The scale tensor of the key tensor. Shape: ``[batch_size, num_kv_heads, (kv_len + BLKK - 1) // BLKK]`` with `float32` dtype. + + Note + ---- + - The tensors `q` and `k` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + """ + + q_int8 = torch.empty(q.shape, dtype=torch.int8, device=q.device) + k_int8 = torch.empty(k.shape, dtype=torch.int8, device=k.device) + + if tensor_layout == "HND": + b, h_qo, qo_len, head_dim = q.shape + _, h_kv, kv_len, _ = k.shape + + elif tensor_layout == "NHD": + b, qo_len, h_qo, head_dim = q.shape + _, kv_len, h_kv, _ = k.shape + + else: + raise ValueError(f"Unknown tensor layout: {tensor_layout}") + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + + q_scale = torch.empty( + (b, h_qo, (qo_len + BLKQ - 1) // BLKQ), device=q.device, dtype=torch.float32 + ) + k_scale = torch.empty( + (b, h_kv, (kv_len + BLKK - 1) // BLKK), device=q.device, dtype=torch.float32 + ) + + if sm_scale is None: + sm_scale = head_dim**-0.5 + + sm_scale *= 1.44269504 + + ops.quant_per_block_int8_cuda(q, q_int8, q_scale, sm_scale, BLKQ, _tensor_layout) + if km is not None: + km = km.squeeze(1) if _tensor_layout == 0 else km.squeeze(2) + ops.quant_per_block_int8_fuse_sub_mean_cuda( + k, km, k_int8, k_scale, BLKK, _tensor_layout + ) + else: + # The bound CUDA op expects an sm_scale argument; use 1.0 for K to avoid scaling + ops.quant_per_block_int8_cuda(k, k_int8, k_scale, 1.0, BLKK, _tensor_layout) + + return q_int8, q_scale, k_int8, k_scale + + +def per_warp_int8( + q: torch.Tensor, + k: torch.Tensor, + km: Optional[torch.Tensor] = None, + BLKQ: int = 128, + WARPQ: int = 32, + BLKK: int = 64, + tensor_layout: str = "HND", +): + """ + Quantize the query tensor `q` with per warp quantization and the key tensor `k` with per block quantization. + Warp size of quantizing `q` is 16 or 32, with a block size of 64 or 128. + Block size of quantizing `k` is 64 or 128. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + km : Optional[torch.Tensor] + The mean tensor of `k` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]``. + Should be of the same dtype as `k` if provided. Default is None. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + Returns + ------- + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor] + A tuple containing: + - The quantized query tensor. Shape: Same as `q` but with `int8` dtype. + - The scale tensor of the query tensor. Shape: ``[batch_size, num_qo_heads, (qo_len + BLKQ - 1) // BLKQ * (BLKQ // WARPQ)]`` with `float32` dtype. + - The quantized key tensor. Shape: Same as `k` but with `int8` dtype. + - The scale tensor of the key tensor. Shape: ``[batch_size, num_kv_heads, (kv_len + BLKK - 1) // BLKK]`` with `float32` dtype. 
+ + Note + ---- + - The tensors `q` and `k` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + """ + + q_int8 = torch.empty(q.shape, dtype=torch.int8, device=q.device) + k_int8 = torch.empty(k.shape, dtype=torch.int8, device=k.device) + + if tensor_layout == "HND": + b, h_qo, qo_len, head_dim = q.shape + _, h_kv, kv_len, _ = k.shape + + elif tensor_layout == "NHD": + b, qo_len, h_qo, head_dim = q.shape + _, kv_len, h_kv, _ = k.shape + + else: + raise ValueError(f"Unknown tensor layout: {tensor_layout}") + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + + q_scale = torch.empty( + (b, h_qo, ((qo_len + BLKQ - 1) // BLKQ) * (BLKQ // WARPQ)), + device=q.device, + dtype=torch.float32, + ) + k_scale = torch.empty( + (b, h_kv, (kv_len + BLKK - 1) // BLKK), device=q.device, dtype=torch.float32 + ) + + ops.quant_per_warp_int8_cuda(q, q_int8, q_scale, BLKQ, WARPQ, _tensor_layout) + + if km is not None: + km = km.squeeze(1) if _tensor_layout == 0 else km.squeeze(2) + ops.quant_per_block_int8_fuse_sub_mean_cuda( + k, km, k_int8, k_scale, BLKK, _tensor_layout + ) + else: + # The bound CUDA op expects an sm_scale argument; use 1.0 for K to avoid scaling + ops.quant_per_block_int8_cuda(k, k_int8, k_scale, 1.0, BLKK, _tensor_layout) + + return q_int8, q_scale, k_int8, k_scale + + +def sub_mean(v: torch.Tensor, tensor_layout: str = "HND"): + """ + Calculate the mean of the tensor `v` along the sequence length dimension and subtract it from `v`. Result is stored as fp16. + + Parameters + ---------- + v : torch.Tensor + The input tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + Returns + ------- + Tuple[torch.Tensor, torch.Tensor] + A tuple containing: + - The tensor `v_smoothed` with the mean subtracted and stored as fp16. Shape: Same as `v` with `float16` dtype. + - The mean tensor of `v` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]`` with dtype same as `v`. + + Note + ---- + - The tensors `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - The returned tensor `v_smoothed` will have dtype ``torch.float16`` regardless of the input dtype. + - The returned mean tensor will have the same dtype as the input tensor. + """ + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + vm = v.mean(dim=1 if _tensor_layout == 0 else 2) + + v_smoothed = torch.empty(v.shape, dtype=torch.float16, device=v.device) + + # subtract mean and store the result as fp16 + ops.sub_mean_cuda(v, vm, v_smoothed, _tensor_layout) + + return v_smoothed, vm + + +def per_channel_fp8( + v: torch.Tensor, + tensor_layout: str = "HND", + scale_max: float = 448.0, + smooth_v: bool = True, +): + """ + Transpose, pad and permute the tensor `v` and quantize it to fp8 with per channel quantization. + `v` is first transposed along the head dimension and the sequence length dimension, then padded to a multiple of 64. + After that, the tensor is permuted along the sequence length dimension by ``[0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15]``. + The quantization is done per channel, with the scale value and smooth factor calculated per channel. + + Parameters + ---------- + v : torch.Tensor + The input tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. 
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". + + scale_max : float + The maximum scale value for the quantization. Default is 448.0 (upper bound of E4M3 data format). + + smooth_v : bool + Whether to smooth the quantized tensor. Default is True. + + Returns + ------- + Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] + A tuple containing: + - The quantized tensor `v_fp8`. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, head_dim, (kv_len + 63) // 64 * 64]``, with `float8_e4m3fn` dtype. + - If `tensor_layout` is "NHD": ``[batch_size, head_dim, num_kv_heads, (kv_len + 63) // 64 * 64]``, with `float8_e4m3fn` dtype. + - The scale tensor of `v`. Shape: ``[batch_size, num_kv_heads, head_dim]`` with `float32` dtype. + - The mean tensor of `v` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]`` with `float32` dtype. + + Note + ---- + - The tensors `v` must have the dtype ``torch.float16`` or ``torch.bfloat16`` + - The returned mean tensor will be None if `smooth_v` is False. Otherwise it will have dtype ``torch.float32``. + """ + + _tensor_layout = 0 if tensor_layout == "NHD" else 1 + + if tensor_layout == "HND": + b, h_kv, kv_len, head_dim = v.shape + padded_len = (kv_len + 63) // 64 * 64 + v_transposed_permutted = torch.empty( + (b, h_kv, head_dim, padded_len), dtype=v.dtype, device=v.device + ) + + elif tensor_layout == "NHD": + b, kv_len, h_kv, head_dim = v.shape + padded_len = (kv_len + 63) // 64 * 64 + v_transposed_permutted = torch.empty( + (b, head_dim, h_kv, padded_len), dtype=v.dtype, device=v.device + ) + + ops.transpose_pad_permute_cuda(v, v_transposed_permutted, _tensor_layout) + + v_fp8 = torch.empty( + v_transposed_permutted.shape, dtype=torch.float8_e4m3fn, device=v.device + ) + + v_scale = torch.empty((b, h_kv, head_dim), dtype=torch.float32, device=v.device) + vm = torch.empty((b, h_kv, head_dim), dtype=torch.float32, device=v.device) + + if smooth_v: + ops.mean_scale_fuse_quant_cuda( + v_transposed_permutted, + v_fp8, + vm, + v_scale, + kv_len, + scale_max, + _tensor_layout, + ) + return v_fp8, v_scale, vm + else: + ops.scale_fuse_quant_cuda( + v_transposed_permutted, v_fp8, v_scale, kv_len, scale_max, _tensor_layout + ) + return v_fp8, v_scale, None diff --git a/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/quant_per_thread.py b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/quant_per_thread.py new file mode 100644 index 0000000000000000000000000000000000000000..ab81f57c3e6dd9df89946ba54c4e2c3844c94d34 --- /dev/null +++ b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/quant_per_thread.py @@ -0,0 +1,204 @@ +""" +Copyright (c) 2024 by SageAttention team. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import torch +import triton +import triton.language as tl + +@triton.jit +def quant_query_per_thread_int8_kernel(Input, Output, Scale, L, + stride_iz, stride_ih, stride_in, + stride_oz, stride_oh, stride_on, + stride_sz, stride_sh, + C: tl.constexpr, BLK: tl.constexpr): + off_blk = tl.program_id(0) // 8 + off_tld = tl.program_id(0) % 8 + off_h = tl.program_id(1) + off_b = tl.program_id(2) + + offs_n = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld + offs_k = tl.arange(0, C) + + input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :] + output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :] + scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 8 + off_tld + + x = tl.load(input_ptrs, mask=offs_n[:, None] < L) + x = x.to(tl.float32) + scale = tl.max(tl.abs(x)) / 127. + 0.0000001 + x_int8 = x / scale + x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1) + x_int8 = x_int8.to(tl.int8) + tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L) + tl.store(scale_ptrs, scale) + +@triton.jit +def quant_key_per_thread_int8_kernel(Input, Output, Scale, L, + stride_iz, stride_ih, stride_in, + stride_oz, stride_oh, stride_on, + stride_sz, stride_sh, + C: tl.constexpr, BLK: tl.constexpr): + off_blk = tl.program_id(0) // 4 + off_tld = tl.program_id(0) % 4 + off_h = tl.program_id(1) + off_b = tl.program_id(2) + + # offs_n = off_blk * BLK + tl.cat(tl.arange(0, BLK // 8) * 8, tl.arange(0, BLK // 8) * 8 + 1, True) + off_tld * 2 + # offs_k = tl.arange(0, C) + + # input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :] + # output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :] + # scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 4 + off_tld + + # x = tl.load(input_ptrs, mask=offs_n[:, None] < L) + # x = x.to(tl.float32) + # scale = tl.max(tl.abs(x)) / 127. + 0.0000001 + # x_int8 = x / scale + # x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1) + # x_int8 = x_int8.to(tl.int8) + # tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L) + # tl.store(scale_ptrs, scale) + + offs_n0 = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld * 2 + offs_n1 = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld * 2 + 1 + offs_k = tl.arange(0, C) + + input_ptrs0 = Input + off_b * stride_iz + off_h * stride_ih + offs_n0[:, None] * stride_in + offs_k[None, :] + input_ptrs1 = Input + off_b * stride_iz + off_h * stride_ih + offs_n1[:, None] * stride_in + offs_k[None, :] + output_ptrs0 = Output + off_b * stride_oz + off_h * stride_oh + offs_n0[:, None] * stride_on + offs_k[None, :] + output_ptrs1 = Output + off_b * stride_oz + off_h * stride_oh + offs_n1[:, None] * stride_on + offs_k[None, :] + scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 4 + off_tld + + x0 = tl.load(input_ptrs0, mask=offs_n0[:, None] < L) + x1 = tl.load(input_ptrs1, mask=offs_n1[:, None] < L) + x0 = x0.to(tl.float32) + x1 = x1.to(tl.float32) + scale = max(tl.max(tl.abs(x0)), tl.max(tl.abs(x1))) / 127. 
+ 0.0000001 + x0_int8 = x0 / scale + x1_int8 = x1 / scale + x0_int8 += 0.5 * tl.where(x0_int8 >= 0, 1, -1) + x1_int8 += 0.5 * tl.where(x1_int8 >= 0, 1, -1) + x0_int8 = x0_int8.to(tl.int8) + x1_int8 = x1_int8.to(tl.int8) + tl.store(output_ptrs0, x0_int8, mask=offs_n0[:, None] < L) + tl.store(output_ptrs1, x1_int8, mask=offs_n1[:, None] < L) + tl.store(scale_ptrs, scale) + +@triton.jit +def quant_query_per_thread_int4_kernel(Input, Output, Scale, L, + stride_iz, stride_ih, stride_in, + stride_oz, stride_oh, stride_on, + stride_sz, stride_sh, + C: tl.constexpr, BLK: tl.constexpr): + off_blk = tl.program_id(0) // 8 + off_tld = tl.program_id(0) % 8 + off_h = tl.program_id(1) + off_b = tl.program_id(2) + + offs_n = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld + offs_k = tl.arange(0, C) + + input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :] + output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :] + scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 8 + off_tld + + x = tl.load(input_ptrs, mask=offs_n[:, None] < L) + x = x.to(tl.float32) + scale = tl.max(tl.abs(x)) / 7. + 0.0000001 + x_int8 = x / scale + x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1) + x_int8 = x_int8.to(tl.int8) + tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L) + tl.store(scale_ptrs, scale) + +@triton.jit +def quant_key_per_thread_int4_kernel(Input, Output, Scale, L, + stride_iz, stride_ih, stride_in, + stride_oz, stride_oh, stride_on, + stride_sz, stride_sh, + C: tl.constexpr, BLK: tl.constexpr): + off_blk = tl.program_id(0) // 4 + off_tld = tl.program_id(0) % 4 + off_h = tl.program_id(1) + off_b = tl.program_id(2) + + offs_n = off_blk * BLK + tl.cat(tl.arange(0, BLK // 8) * 8, tl.arange(0, BLK // 8) * 8 + 1, True) + off_tld * 2 + offs_k = tl.arange(0, C) + + input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :] + output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :] + scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 4 + off_tld + + x = tl.load(input_ptrs, mask=offs_n[:, None] < L) + x = x.to(tl.float32) + scale = tl.max(tl.abs(x)) / 7. 
+ 0.0000001 + x_int8 = x / scale + x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1) + x_int8 = x_int8.to(tl.int8) + tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L) + tl.store(scale_ptrs, scale) + +def per_thread_int8(q, k, km=None, BLKQ=128, WARPQ=32, BLKK=64, WARPK=64, sm_scale=None, tensor_layout="HND"): + q_int8 = torch.empty(q.shape, dtype=torch.int8, device=q.device) + k_int8 = torch.empty(k.shape, dtype=torch.int8, device=k.device) + + if km is not None: + k = k - km + + if tensor_layout == "HND": + b, h_qo, qo_len, head_dim = q.shape + _, h_kv, kv_len, _ = k.shape + + stride_bz_q, stride_h_q, stride_seq_q = q.stride(0), q.stride(1), q.stride(2) + stride_bz_qo, stride_h_qo, stride_seq_qo = q_int8.stride(0), q_int8.stride(1), q_int8.stride(2) + stride_bz_k, stride_h_k, stride_seq_k = k.stride(0), k.stride(1), k.stride(2) + stride_bz_ko, stride_h_ko, stride_seq_ko = k_int8.stride(0), k_int8.stride(1), k_int8.stride(2) + elif tensor_layout == "NHD": + b, qo_len, h_qo, head_dim = q.shape + _, kv_len, h_kv, _ = k.shape + + stride_bz_q, stride_h_q, stride_seq_q = q.stride(0), q.stride(2), q.stride(1) + stride_bz_qo, stride_h_qo, stride_seq_qo = q_int8.stride(0), q_int8.stride(2), q_int8.stride(1) + stride_bz_k, stride_h_k, stride_seq_k = k.stride(0), k.stride(2), k.stride(1) + stride_bz_ko, stride_h_ko, stride_seq_ko = k_int8.stride(0), k_int8.stride(2), k_int8.stride(1) + else: + raise ValueError(f"Unknown tensor layout: {tensor_layout}") + + q_scale = torch.empty((b, h_qo, (qo_len + BLKQ - 1) // BLKQ * (BLKQ // WARPQ) * 8), device=q.device, dtype=torch.float32) + k_scale = torch.empty((b, h_kv, (kv_len + BLKK - 1) // BLKK * (BLKK // WARPK) * 4), device=q.device, dtype=torch.float32) + + if sm_scale is None: + sm_scale = head_dim**-0.5 + + grid = ((qo_len + BLKQ - 1) // BLKQ * (BLKQ // WARPQ) * 8, h_qo, b) + quant_query_per_thread_int8_kernel[grid]( + q, q_int8, q_scale, qo_len, + stride_bz_q, stride_h_q, stride_seq_q, + stride_bz_qo, stride_h_qo, stride_seq_qo, + q_scale.stride(0), q_scale.stride(1), + C=head_dim, BLK=WARPQ + ) + + grid = ((kv_len + BLKK - 1) // BLKK * (BLKK // WARPK) * 4, h_kv, b) + quant_key_per_thread_int8_kernel[grid]( + k, k_int8, k_scale, kv_len, + stride_bz_k, stride_h_k, stride_seq_k, + stride_bz_ko, stride_h_ko, stride_seq_ko, + k_scale.stride(0), k_scale.stride(1), + C=head_dim, BLK=WARPK + ) + + return q_int8, q_scale, k_int8, k_scale \ No newline at end of file diff --git a/nix-build.log b/nix-build.log new file mode 100644 index 0000000000000000000000000000000000000000..6092aaedcfeaaa3a004fdf85a1e9fb8f117f5fda --- /dev/null +++ b/nix-build.log @@ -0,0 +1,13519 @@ +warning: Git tree '/home/ec2-user/dev/sage_attention' is dirty +warning: not writing modified lock file of flake 'git+file:///home/ec2-user/dev/sage_attention': +• Updated input 'kernel-builder': + 'github:huggingface/kernel-builder/967c94ec67830c5d85dc981407478939edd169f9?narHash=sha256-0EbrJkAx7yTOOjfJQFbk%2BBLo4MYfsD6JgRKibRYioo8%3D' (2025-09-25) + → 'github:huggingface/kernel-builder/9532ae833d245c03cb4daebd510e89e14cd27e7d?narHash=sha256-LYeNDsudfsy7extv59oyyirOv0%2BCG4hPIvTKnUaG7m0%3D' (2025-09-30) +evaluation warning: `rev` argument of `genFlakeOutputs` is deprecated, pass `self` as follows: + + kernel-builder.lib.genFlakeOutputs { + inherit self; + path = ./.; + }; +these 7 derivations will be built: + /nix/store/5i3gnhgvv278c7m9q3x3agksl5jab9ck-sage_attention-torch-ext.drv + /nix/store/bawig99wpvl8dvmdb3znykgir3w1nw15-sage_attention-torch-ext.drv + 
/nix/store/jzgwmpf18h1rrvfhclhq4say6m90j7y4-sage_attention-torch-ext.drv + /nix/store/msa3cr0rrgkm0dagqbcs67k8s169474b-sage_attention-torch-ext.drv + /nix/store/qckl1ak5l089b2sakw4h2whnd6mg16ld-sage_attention-torch-ext.drv + /nix/store/xq28asxbqp6g7x8bcz92xl849prg2899-torch-ext-bundle.drv + /nix/store/rkzh9xwk6kdgl1by4xfwmyvb5arpfqby-build-and-copy.drv +building '/nix/store/5i3gnhgvv278c7m9q3x3agksl5jab9ck-sage_attention-torch-ext.drv'... +building '/nix/store/bawig99wpvl8dvmdb3znykgir3w1nw15-sage_attention-torch-ext.drv'... +building '/nix/store/jzgwmpf18h1rrvfhclhq4say6m90j7y4-sage_attention-torch-ext.drv'... +building '/nix/store/msa3cr0rrgkm0dagqbcs67k8s169474b-sage_attention-torch-ext.drv'... +building '/nix/store/qckl1ak5l089b2sakw4h2whnd6mg16ld-sage_attention-torch-ext.drv'... +sage_attention-torch-ext> Sourcing get-kernel-check-hook.sh +sage_attention-torch-ext> Sourcing setup-cuda-hook +sage_attention-torch-ext> Sourcing get-kernel-check-hook.sh +sage_attention-torch-ext> Sourcing setup-cuda-hook +sage_attention-torch-ext> Sourcing get-kernel-check-hook.sh +sage_attention-torch-ext> Sourcing setup-cuda-hook +sage_attention-torch-ext> Sourcing get-kernel-check-hook.sh +sage_attention-torch-ext> Sourcing setup-cuda-hook +sage_attention-torch-ext> Sourcing get-kernel-check-hook.sh +sage_attention-torch-ext> Sourcing setup-cuda-hook +sage_attention-torch-ext> Running phase: unpackPhase +sage_attention-torch-ext> unpacking source archive /nix/store/zgm080lkrxljczr1rfx3aa781rzxzc4p-source +sage_attention-torch-ext> source root is source +sage_attention-torch-ext> Running phase: unpackPhase +sage_attention-torch-ext> Running phase: patchPhase +sage_attention-torch-ext> unpacking source archive /nix/store/zgm080lkrxljczr1rfx3aa781rzxzc4p-source +sage_attention-torch-ext> source root is source +sage_attention-torch-ext> Running phase: updateAutotoolsGnuConfigScriptsPhase +sage_attention-torch-ext> Running phase: unpackPhase +sage_attention-torch-ext> Running phase: patchPhase +sage_attention-torch-ext> unpacking source archive /nix/store/zgm080lkrxljczr1rfx3aa781rzxzc4p-source +sage_attention-torch-ext> Running phase: configurePhase +sage_attention-torch-ext> source root is source +sage_attention-torch-ext> Running phase: updateAutotoolsGnuConfigScriptsPhase +sage_attention-torch-ext> Running phase: unpackPhase +sage_attention-torch-ext> Executing setupCUDAToolkitCompilers +sage_attention-torch-ext> Running phase: patchPhase +sage_attention-torch-ext> fixing cmake files... +sage_attention-torch-ext> unpacking source archive /nix/store/zgm080lkrxljczr1rfx3aa781rzxzc4p-source +sage_attention-torch-ext> Running phase: configurePhase +sage_attention-torch-ext> source root is source +sage_attention-torch-ext> Running phase: updateAutotoolsGnuConfigScriptsPhase +sage_attention-torch-ext> Running phase: unpackPhase +sage_attention-torch-ext> Executing setupCUDAToolkitCompilers +sage_attention-torch-ext> Running phase: patchPhase +sage_attention-torch-ext> fixing cmake files... +sage_attention-torch-ext> unpacking source archive /nix/store/zgm080lkrxljczr1rfx3aa781rzxzc4p-source +sage_attention-torch-ext> Running phase: configurePhase +sage_attention-torch-ext> source root is source +sage_attention-torch-ext> Running phase: updateAutotoolsGnuConfigScriptsPhase +sage_attention-torch-ext> Executing setupCUDAToolkitCompilers +sage_attention-torch-ext> Running phase: patchPhase +sage_attention-torch-ext> fixing cmake files... 
+sage_attention-torch-ext> Running phase: configurePhase +sage_attention-torch-ext> cmake flags: -GNinja -DCMAKE_FIND_USE_SYSTEM_PACKAGE_REGISTRY=OFF -DCMAKE_FIND_USE_PACKAGE_REGISTRY=OFF -DCMAKE_EXPORT_NO_PACKAGE_REGISTRY=ON -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTING=OFF -DCMAKE_INSTALL_LOCALEDIR=/nix/store/wqplrwp0m14wqjzmh3x18a5pgc2kcs8k-sage_attention-torch-ext/share/locale -DCMAKE_INSTALL_LIBEXECDIR=/nix/store/wqplrwp0m14wqjzmh3x18a5pgc2kcs8k-sage_attention-torch-ext/libexec -DCMAKE_INSTALL_LIBDIR=/nix/store/wqplrwp0m14wqjzmh3x18a5pgc2kcs8k-sage_attention-torch-ext/lib -DCMAKE_INSTALL_DOCDIR=/nix/store/wqplrwp0m14wqjzmh3x18a5pgc2kcs8k-sage_attention-torch-ext/share/doc/sage_attention -DCMAKE_INSTALL_INFODIR=/nix/store/wqplrwp0m14wqjzmh3x18a5pgc2kcs8k-sage_attention-torch-ext/share/info -DCMAKE_INSTALL_MANDIR=/nix/store/wqplrwp0m14wqjzmh3x18a5pgc2kcs8k-sage_attention-torch-ext/share/man -DCMAKE_INSTALL_INCLUDEDIR=/nix/store/wqplrwp0m14wqjzmh3x18a5pgc2kcs8k-sage_attention-torch-ext/include -DCMAKE_INSTALL_SBINDIR=/nix/store/wqplrwp0m14wqjzmh3x18a5pgc2kcs8k-sage_attention-torch-ext/sbin -DCMAKE_INSTALL_BINDIR=/nix/store/wqplrwp0m14wqjzmh3x18a5pgc2kcs8k-sage_attention-torch-ext/bin -DCMAKE_INSTALL_NAME_DIR=/nix/store/wqplrwp0m14wqjzmh3x18a5pgc2kcs8k-sage_attention-torch-ext/lib -DCMAKE_POLICY_DEFAULT_CMP0025=NEW -DCMAKE_FIND_FRAMEWORK=LAST -DCMAKE_STRIP=/nix/store/rgfv9lch0b6ksjzlzsx0mljsb0ypqr8x-gcc-wrapper-13.4.0/bin/strip -DCMAKE_RANLIB=/nix/store/rgfv9lch0b6ksjzlzsx0mljsb0ypqr8x-gcc-wrapper-13.4.0/bin/ranlib -DCMAKE_AR=/nix/store/rgfv9lch0b6ksjzlzsx0mljsb0ypqr8x-gcc-wrapper-13.4.0/bin/ar -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DCMAKE_INSTALL_PREFIX=/nix/store/wqplrwp0m14wqjzmh3x18a5pgc2kcs8k-sage_attention-torch-ext -DPython_EXECUTABLE:STRING=/nix/store/j6r6hpjs8p5m4s3i8cqqavg62fd5z48g-python3-3.13.6-env/bin/python -DCMAKE_CUDA_HOST_COMPILER:STRING=/nix/store/rgfv9lch0b6ksjzlzsx0mljsb0ypqr8x-gcc-wrapper-13.4.0/bin/g++ -DNVCC_THREADS=3 -DCUDAToolkit_INCLUDE_DIR=/nix/store/7iw4ipbdy17yzvqjhxpw03i17kq7f7rj-cuda_nvcc-12.6.85/include\;/nix/store/5f6h6xs5c74iqcjda3y73i290mfwfs9x-cuda_nvml_dev-12.6.77-dev/include\;/nix/store/r26q9f2lhsvimxha44g1xcck3adrdqwg-cuda_nvrtc-12.6.85-dev/include\;/nix/store/nj1a061pvzpq9dr65yj3jpjqcx6pr4fq-cuda_nvtx-12.6.77-dev/include\;/nix/store/9ik1skjb698l6vkx4m4wvx2nrr4sx0na-libcufft-11.3.0.4-dev/include\;/nix/store/vl1dficb0blxzqg6xqzfi5p119jvl2vi-libcusolver-11.7.1.2-dev/include\;/nix/store/n7x9kkzi2jdfj6f6yjwywfhyfmn957zp-cuda_cupti-12.6.80-dev/include\;/nix/store/sskxmb670akk0avrahrl4r6hp7925zh8-cuda_cudart-12.6.77-dev/include\;/nix/store/8a9vz66yzsar01lpgipmzq8skyk3ymkp-cuda_cccl-12.6.77-dev/include\;/nix/store/xd2xrldv3lbg1bk93nr0yccy6j0vhh2k-cudnn-9.11.0.98-dev/include\;/nix/store/0w4g3rxgkw9r0lv737rslqdk7wldmi0n-libcurand-10.3.7.77-dev/include\;/nix/store/m0s4p867fk6wk8ba7ym9yff4mayqjhlw-libcusparse-12.5.4.2-dev/include\;/nix/store/blh9iyvjkmwd871mfjvfhnp7njwgnc6b-cuda_profiler_api-12.6.77-dev/include\;/nix/store/fy71fffqbwg3xgvygn66kd4igj65gblv-libcublas-12.6.4.1-dev/include\;/nix/store/4pwy3k2s52ppzbs3k6d58kda8jhmiim4-libcufile-1.11.1.6-dev/include 
-DCUDAToolkit_ROOT=/nix/store/7iw4ipbdy17yzvqjhxpw03i17kq7f7rj-cuda_nvcc-12.6.85\;/nix/store/1qgrl2sgdj5m7llm2vs9690gd9998psq-cudnn-9.11.0.98\;/nix/store/d2z15dzsgfm4r2yyl16n3wc0sw8z6fia-cuda_cupti-12.6.80-lib\;/nix/store/86ngm5djfbl6a0i43j282680chqz1vr8-libcusparse-12.5.4.2-lib\;/nix/store/bmph9rbyqnyjs02zriwq78kg16h12wi6-libcublas-12.6.4.1-lib\;/nix/store/wny8xmyma0ziffas96ansxgmjfqpw393-cuda_nvrtc-12.6.85-lib\;/nix/store/j40ndiqjiqbiqrbfmgmkzz6w8757cgvk-cuda_nvml_dev-12.6.77-lib\;/nix/store/3ii532blh586xxavim32i21kr84wlcdc-cuda_profiler_api-12.6.77\;/nix/store/j32l8jnzckhdy2lzxgyd59y7p39y6b1d-libcusolver-11.7.1.2-static\;/nix/store/5iv2zpbf4k00ch4c5zfi5b8dlj90y3d3-cuda_cccl-12.6.77\;/nix/store/a8yi28jqv5185bbv10jpjja3x98i86hm-cuda_cudart-12.6.77-stubs\;/nix/store/ya85qn68jv6mlq6gh6phh5hwk3dkynag-cuda_cudart-12.6.77-static\;/nix/store/m65ribrsnk3gbabcx9ah6phgiil19j01-libcufile-1.11.1.6\;/nix/store/5f6h6xs5c74iqcjda3y73i290mfwfs9x-cuda_nvml_dev-12.6.77-dev\;/nix/store/r26q9f2lhsvimxha44g1xcck3adrdqwg-cuda_nvrtc-12.6.85-dev\;/nix/store/nj1a061pvzpq9dr65yj3jpjqcx6pr4fq-cuda_nvtx-12.6.77-dev\;/nix/store/bcvj4g3f3n6cpb6czcb5k8zdmyd94fwi-cuda_nvtx-12.6.77-lib\;/nix/store/9ik1skjb698l6vkx4m4wvx2nrr4sx0na-libcufft-11.3.0.4-dev\;/nix/store/k5rbpivsz3ilsxg91pgigp6la8ln3cv9-cuda_cupti-12.6.80\;/nix/store/vl1dficb0blxzqg6xqzfi5p119jvl2vi-libcusolver-11.7.1.2-dev\;/nix/store/f87x0n0gi2d7rxh1ja92za2ixcw60q2p-cuda_nvtx-12.6.77\;/nix/store/n7x9kkzi2jdfj6f6yjwywfhyfmn957zp-cuda_cupti-12.6.80-dev\;/nix/store/m0fwdgh4nmrjd0q9v4m2ly63qbcq2hi2-cuda_cudart-12.6.77\;/nix/store/qfaxx4b8l1alrrl0gbyb23k3j850c0v5-libcurand-10.3.7.77-static\;/nix/store/w1npzy8mfl28w7cib5idkg6nvlbzhpzq-libcufile-1.11.1.6-lib\;/nix/store/8abbm2gd77dv0l3acw0s18wln36aa0l5-cuda_cudart-12.6.77-lib\;/nix/store/ykb9bv2lqkf1wzy73q96cb04pybx9xa2-cuda_nvcc-12.6.85-static\;/nix/store/nw9ws2qvhgdb33qgfx4iqj517814qq8y-libcufft-11.3.0.4\;/nix/store/sskxmb670akk0avrahrl4r6hp7925zh8-cuda_cudart-12.6.77-dev\;/nix/store/mfc3ah6lwfd8dfbs77b0z9i75c471b0n-libcufft-11.3.0.4-static\;/nix/store/zk3cg1ws6cskrzyhdr5d68f8zrkfk77d-cuda_nvrtc-12.6.85-static\;/nix/store/pcrirrvn2ya5d3r1y18s2zj4pm2jladw-libcusolver-11.7.1.2\;/nix/store/qdn67x8jrwr418air16kwicya4d747pq-libcufft-11.3.0.4-lib\;/nix/store/dg8hyrzy7sh3wdhcr4ywsz05cvl6vfyc-libcusparse-12.5.4.2\;/nix/store/8a9vz66yzsar01lpgipmzq8skyk3ymkp-cuda_cccl-12.6.77-dev\;/nix/store/wmcrrdxd3db58nklyp7yf90kknfdx6b5-libcurand-10.3.7.77-lib\;/nix/store/xd2xrldv3lbg1bk93nr0yccy6j0vhh2k-cudnn-9.11.0.98-dev\;/nix/store/0w4g3rxgkw9r0lv737rslqdk7wldmi0n-libcurand-10.3.7.77-dev\;/nix/store/jr1397g6pshvil5n4lnvp7dm24dm71h8-libcublas-12.6.4.1-static\;/nix/store/wq0wv7df58h6bgggnz964sk8m1hbkxxp-cuda_cupti-12.6.80-sample\;/nix/store/m0s4p867fk6wk8ba7ym9yff4mayqjhlw-libcusparse-12.5.4.2-dev\;/nix/store/blh9iyvjkmwd871mfjvfhnp7njwgnc6b-cuda_profiler_api-12.6.77-dev\;/nix/store/ngwsphsxf906z7cgwg32d1w83p809ywl-cudnn-9.11.0.98-static\;/nix/store/07zlxn68jyf4s263xafnjid55grmi7a2-cuda_nvrtc-12.6.85\;/nix/store/zyh7hqq402zc7dhafhbh9vycyzcfq256-libcurand-10.3.7.77\;/nix/store/x7mww4k0zzzb7bnffv0b22jqbyf1mg3v-cuda_cupti-12.6.80-static\;/nix/store/xvlapjc6spss1kvbjlq97m6pk19hfrxz-cuda_nvml_dev-12.6.77\;/nix/store/7j4zf0r8flh7l4x5pm1mgqb2vcabmcdj-libcusolver-11.7.1.2-lib\;/nix/store/gs8gw8bgjccrjxlyzhxa7h85gkxgqwhn-libcufile-1.11.1.6-static\;/nix/store/p9dnsv7mv8mqm9aisrckq8lm3zs3l7dk-cudnn-9.11.0.98-lib\;/nix/store/fy71fffqbwg3xgvygn66kd4igj65gblv-libcublas-12.6.4.1-dev\;/nix/store/dpska4iiya4xa5zzzmqzx3ljws73bnds-cuda_nvml_dev-12.6.77-
static\;/nix/store/gzykkbwmch7pxgfzf86fg0b928lz6b36-libcusparse-12.5.4.2-static\;/nix/store/nqn7lvw8gbwbymdhz4nak9wf9b5bbah9-libcublas-12.6.4.1\;/nix/store/4pwy3k2s52ppzbs3k6d58kda8jhmiim4-libcufile-1.11.1.6-dev -DPROTOC_EXE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DProtobuf_PROTOC_EXE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DProtobuf_PROTOC_EXECUTABLE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DPYBIND11_PYTHONLIBS_OVERWRITE=OFF -DPYTHON_EXECUTABLE=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/bin/python3.13 -DPYTHON_INCLUDE_DIR=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/include/python3.13 -DPYTHON_SITE_PACKAGES=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/lib/python3.13/site-packages +sage_attention-torch-ext> Running phase: updateAutotoolsGnuConfigScriptsPhase +sage_attention-torch-ext> Executing setupCUDAToolkitCompilers +sage_attention-torch-ext> fixing cmake files... +sage_attention-torch-ext> Running phase: configurePhase +sage_attention-torch-ext> cmake flags: -GNinja -DCMAKE_FIND_USE_SYSTEM_PACKAGE_REGISTRY=OFF -DCMAKE_FIND_USE_PACKAGE_REGISTRY=OFF -DCMAKE_EXPORT_NO_PACKAGE_REGISTRY=ON -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTING=OFF -DCMAKE_INSTALL_LOCALEDIR=/nix/store/794gpxvpsn0jkzi2zndpd6i6nhspwid2-sage_attention-torch-ext/share/locale -DCMAKE_INSTALL_LIBEXECDIR=/nix/store/794gpxvpsn0jkzi2zndpd6i6nhspwid2-sage_attention-torch-ext/libexec -DCMAKE_INSTALL_LIBDIR=/nix/store/794gpxvpsn0jkzi2zndpd6i6nhspwid2-sage_attention-torch-ext/lib -DCMAKE_INSTALL_DOCDIR=/nix/store/794gpxvpsn0jkzi2zndpd6i6nhspwid2-sage_attention-torch-ext/share/doc/sage_attention -DCMAKE_INSTALL_INFODIR=/nix/store/794gpxvpsn0jkzi2zndpd6i6nhspwid2-sage_attention-torch-ext/share/info -DCMAKE_INSTALL_MANDIR=/nix/store/794gpxvpsn0jkzi2zndpd6i6nhspwid2-sage_attention-torch-ext/share/man -DCMAKE_INSTALL_INCLUDEDIR=/nix/store/794gpxvpsn0jkzi2zndpd6i6nhspwid2-sage_attention-torch-ext/include -DCMAKE_INSTALL_SBINDIR=/nix/store/794gpxvpsn0jkzi2zndpd6i6nhspwid2-sage_attention-torch-ext/sbin -DCMAKE_INSTALL_BINDIR=/nix/store/794gpxvpsn0jkzi2zndpd6i6nhspwid2-sage_attention-torch-ext/bin -DCMAKE_INSTALL_NAME_DIR=/nix/store/794gpxvpsn0jkzi2zndpd6i6nhspwid2-sage_attention-torch-ext/lib -DCMAKE_POLICY_DEFAULT_CMP0025=NEW -DCMAKE_FIND_FRAMEWORK=LAST -DCMAKE_STRIP=/nix/store/rgfv9lch0b6ksjzlzsx0mljsb0ypqr8x-gcc-wrapper-13.4.0/bin/strip -DCMAKE_RANLIB=/nix/store/rgfv9lch0b6ksjzlzsx0mljsb0ypqr8x-gcc-wrapper-13.4.0/bin/ranlib -DCMAKE_AR=/nix/store/rgfv9lch0b6ksjzlzsx0mljsb0ypqr8x-gcc-wrapper-13.4.0/bin/ar -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DCMAKE_INSTALL_PREFIX=/nix/store/794gpxvpsn0jkzi2zndpd6i6nhspwid2-sage_attention-torch-ext -DPython_EXECUTABLE:STRING=/nix/store/r3gwdvvsgl1csl12f4pkhz0jhsch7bdy-python3-3.13.6-env/bin/python -DCMAKE_CUDA_HOST_COMPILER:STRING=/nix/store/rgfv9lch0b6ksjzlzsx0mljsb0ypqr8x-gcc-wrapper-13.4.0/bin/g++ -DNVCC_THREADS=3 
-DCUDAToolkit_INCLUDE_DIR=/nix/store/7iw4ipbdy17yzvqjhxpw03i17kq7f7rj-cuda_nvcc-12.6.85/include\;/nix/store/5f6h6xs5c74iqcjda3y73i290mfwfs9x-cuda_nvml_dev-12.6.77-dev/include\;/nix/store/r26q9f2lhsvimxha44g1xcck3adrdqwg-cuda_nvrtc-12.6.85-dev/include\;/nix/store/9ik1skjb698l6vkx4m4wvx2nrr4sx0na-libcufft-11.3.0.4-dev/include\;/nix/store/vl1dficb0blxzqg6xqzfi5p119jvl2vi-libcusolver-11.7.1.2-dev/include\;/nix/store/n7x9kkzi2jdfj6f6yjwywfhyfmn957zp-cuda_cupti-12.6.80-dev/include\;/nix/store/sskxmb670akk0avrahrl4r6hp7925zh8-cuda_cudart-12.6.77-dev/include\;/nix/store/8a9vz66yzsar01lpgipmzq8skyk3ymkp-cuda_cccl-12.6.77-dev/include\;/nix/store/xd2xrldv3lbg1bk93nr0yccy6j0vhh2k-cudnn-9.11.0.98-dev/include\;/nix/store/0w4g3rxgkw9r0lv737rslqdk7wldmi0n-libcurand-10.3.7.77-dev/include\;/nix/store/m0s4p867fk6wk8ba7ym9yff4mayqjhlw-libcusparse-12.5.4.2-dev/include\;/nix/store/blh9iyvjkmwd871mfjvfhnp7njwgnc6b-cuda_profiler_api-12.6.77-dev/include\;/nix/store/fy71fffqbwg3xgvygn66kd4igj65gblv-libcublas-12.6.4.1-dev/include\;/nix/store/4pwy3k2s52ppzbs3k6d58kda8jhmiim4-libcufile-1.11.1.6-dev/include -DCUDAToolkit_ROOT=/nix/store/7iw4ipbdy17yzvqjhxpw03i17kq7f7rj-cuda_nvcc-12.6.85\;/nix/store/1qgrl2sgdj5m7llm2vs9690gd9998psq-cudnn-9.11.0.98\;/nix/store/d2z15dzsgfm4r2yyl16n3wc0sw8z6fia-cuda_cupti-12.6.80-lib\;/nix/store/86ngm5djfbl6a0i43j282680chqz1vr8-libcusparse-12.5.4.2-lib\;/nix/store/bmph9rbyqnyjs02zriwq78kg16h12wi6-libcublas-12.6.4.1-lib\;/nix/store/wny8xmyma0ziffas96ansxgmjfqpw393-cuda_nvrtc-12.6.85-lib\;/nix/store/j40ndiqjiqbiqrbfmgmkzz6w8757cgvk-cuda_nvml_dev-12.6.77-lib\;/nix/store/3ii532blh586xxavim32i21kr84wlcdc-cuda_profiler_api-12.6.77\;/nix/store/j32l8jnzckhdy2lzxgyd59y7p39y6b1d-libcusolver-11.7.1.2-static\;/nix/store/5iv2zpbf4k00ch4c5zfi5b8dlj90y3d3-cuda_cccl-12.6.77\;/nix/store/a8yi28jqv5185bbv10jpjja3x98i86hm-cuda_cudart-12.6.77-stubs\;/nix/store/ya85qn68jv6mlq6gh6phh5hwk3dkynag-cuda_cudart-12.6.77-static\;/nix/store/m65ribrsnk3gbabcx9ah6phgiil19j01-libcufile-1.11.1.6\;/nix/store/5f6h6xs5c74iqcjda3y73i290mfwfs9x-cuda_nvml_dev-12.6.77-dev\;/nix/store/r26q9f2lhsvimxha44g1xcck3adrdqwg-cuda_nvrtc-12.6.85-dev\;/nix/store/9ik1skjb698l6vkx4m4wvx2nrr4sx0na-libcufft-11.3.0.4-dev\;/nix/store/k5rbpivsz3ilsxg91pgigp6la8ln3cv9-cuda_cupti-12.6.80\;/nix/store/vl1dficb0blxzqg6xqzfi5p119jvl2vi-libcusolver-11.7.1.2-dev\;/nix/store/n7x9kkzi2jdfj6f6yjwywfhyfmn957zp-cuda_cupti-12.6.80-dev\;/nix/store/m0fwdgh4nmrjd0q9v4m2ly63qbcq2hi2-cuda_cudart-12.6.77\;/nix/store/qfaxx4b8l1alrrl0gbyb23k3j850c0v5-libcurand-10.3.7.77-static\;/nix/store/w1npzy8mfl28w7cib5idkg6nvlbzhpzq-libcufile-1.11.1.6-lib\;/nix/store/8abbm2gd77dv0l3acw0s18wln36aa0l5-cuda_cudart-12.6.77-lib\;/nix/store/ykb9bv2lqkf1wzy73q96cb04pybx9xa2-cuda_nvcc-12.6.85-static\;/nix/store/nw9ws2qvhgdb33qgfx4iqj517814qq8y-libcufft-11.3.0.4\;/nix/store/sskxmb670akk0avrahrl4r6hp7925zh8-cuda_cudart-12.6.77-dev\;/nix/store/mfc3ah6lwfd8dfbs77b0z9i75c471b0n-libcufft-11.3.0.4-static\;/nix/store/zk3cg1ws6cskrzyhdr5d68f8zrkfk77d-cuda_nvrtc-12.6.85-static\;/nix/store/pcrirrvn2ya5d3r1y18s2zj4pm2jladw-libcusolver-11.7.1.2\;/nix/store/qdn67x8jrwr418air16kwicya4d747pq-libcufft-11.3.0.4-lib\;/nix/store/dg8hyrzy7sh3wdhcr4ywsz05cvl6vfyc-libcusparse-12.5.4.2\;/nix/store/8a9vz66yzsar01lpgipmzq8skyk3ymkp-cuda_cccl-12.6.77-dev\;/nix/store/wmcrrdxd3db58nklyp7yf90kknfdx6b5-libcurand-10.3.7.77-lib\;/nix/store/xd2xrldv3lbg1bk93nr0yccy6j0vhh2k-cudnn-9.11.0.98-dev\;/nix/store/0w4g3rxgkw9r0lv737rslqdk7wldmi0n-libcurand-10.3.7.77-dev\;/nix/store/jr1397g6pshvil5n4lnvp7dm24dm71h8-libcublas-12.6.4.1-s
tatic\;/nix/store/wq0wv7df58h6bgggnz964sk8m1hbkxxp-cuda_cupti-12.6.80-sample\;/nix/store/m0s4p867fk6wk8ba7ym9yff4mayqjhlw-libcusparse-12.5.4.2-dev\;/nix/store/blh9iyvjkmwd871mfjvfhnp7njwgnc6b-cuda_profiler_api-12.6.77-dev\;/nix/store/ngwsphsxf906z7cgwg32d1w83p809ywl-cudnn-9.11.0.98-static\;/nix/store/07zlxn68jyf4s263xafnjid55grmi7a2-cuda_nvrtc-12.6.85\;/nix/store/zyh7hqq402zc7dhafhbh9vycyzcfq256-libcurand-10.3.7.77\;/nix/store/x7mww4k0zzzb7bnffv0b22jqbyf1mg3v-cuda_cupti-12.6.80-static\;/nix/store/xvlapjc6spss1kvbjlq97m6pk19hfrxz-cuda_nvml_dev-12.6.77\;/nix/store/7j4zf0r8flh7l4x5pm1mgqb2vcabmcdj-libcusolver-11.7.1.2-lib\;/nix/store/gs8gw8bgjccrjxlyzhxa7h85gkxgqwhn-libcufile-1.11.1.6-static\;/nix/store/p9dnsv7mv8mqm9aisrckq8lm3zs3l7dk-cudnn-9.11.0.98-lib\;/nix/store/fy71fffqbwg3xgvygn66kd4igj65gblv-libcublas-12.6.4.1-dev\;/nix/store/dpska4iiya4xa5zzzmqzx3ljws73bnds-cuda_nvml_dev-12.6.77-static\;/nix/store/gzykkbwmch7pxgfzf86fg0b928lz6b36-libcusparse-12.5.4.2-static\;/nix/store/nqn7lvw8gbwbymdhz4nak9wf9b5bbah9-libcublas-12.6.4.1\;/nix/store/4pwy3k2s52ppzbs3k6d58kda8jhmiim4-libcufile-1.11.1.6-dev -DPROTOC_EXE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DProtobuf_PROTOC_EXE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DProtobuf_PROTOC_EXECUTABLE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DPYBIND11_PYTHONLIBS_OVERWRITE=OFF -DPYTHON_EXECUTABLE=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/bin/python3.13 -DPYTHON_INCLUDE_DIR=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/include/python3.13 -DPYTHON_SITE_PACKAGES=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/lib/python3.13/site-packages +sage_attention-torch-ext> Executing setupCUDAToolkitCompilers +sage_attention-torch-ext> cmake flags: -GNinja -DCMAKE_FIND_USE_SYSTEM_PACKAGE_REGISTRY=OFF -DCMAKE_FIND_USE_PACKAGE_REGISTRY=OFF -DCMAKE_EXPORT_NO_PACKAGE_REGISTRY=ON -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTING=OFF -DCMAKE_INSTALL_LOCALEDIR=/nix/store/vas14gsv17kdkap3r9szvjsclanmwq25-sage_attention-torch-ext/share/locale -DCMAKE_INSTALL_LIBEXECDIR=/nix/store/vas14gsv17kdkap3r9szvjsclanmwq25-sage_attention-torch-ext/libexec -DCMAKE_INSTALL_LIBDIR=/nix/store/vas14gsv17kdkap3r9szvjsclanmwq25-sage_attention-torch-ext/lib -DCMAKE_INSTALL_DOCDIR=/nix/store/vas14gsv17kdkap3r9szvjsclanmwq25-sage_attention-torch-ext/share/doc/sage_attention -DCMAKE_INSTALL_INFODIR=/nix/store/vas14gsv17kdkap3r9szvjsclanmwq25-sage_attention-torch-ext/share/info -DCMAKE_INSTALL_MANDIR=/nix/store/vas14gsv17kdkap3r9szvjsclanmwq25-sage_attention-torch-ext/share/man -DCMAKE_INSTALL_INCLUDEDIR=/nix/store/vas14gsv17kdkap3r9szvjsclanmwq25-sage_attention-torch-ext/include -DCMAKE_INSTALL_SBINDIR=/nix/store/vas14gsv17kdkap3r9szvjsclanmwq25-sage_attention-torch-ext/sbin -DCMAKE_INSTALL_BINDIR=/nix/store/vas14gsv17kdkap3r9szvjsclanmwq25-sage_attention-torch-ext/bin -DCMAKE_INSTALL_NAME_DIR=/nix/store/vas14gsv17kdkap3r9szvjsclanmwq25-sage_attention-torch-ext/lib -DCMAKE_POLICY_DEFAULT_CMP0025=NEW -DCMAKE_FIND_FRAMEWORK=LAST -DCMAKE_STRIP=/nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/strip -DCMAKE_RANLIB=/nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/ranlib -DCMAKE_AR=/nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/ar -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DCMAKE_INSTALL_PREFIX=/nix/store/vas14gsv17kdkap3r9szvjsclanmwq25-sage_attention-torch-ext 
-DPython_EXECUTABLE:STRING=/nix/store/aikr517kmcd8r2nrrj70jq71d7352qiq-python3-3.13.6-env/bin/python -DCMAKE_CUDA_HOST_COMPILER:STRING=/nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/g++ -DNVCC_THREADS=3 -DCUDAToolkit_INCLUDE_DIR=/nix/store/kky5wd8qwb0hx3jb3j9qc1bkwznw3z83-libcusparse-12.5.10.65-dev/include\;/nix/store/dd8wl3nnsigw2gj5bwaiswla97jpw1jz-libcublas-12.9.1.4-dev/include\;/nix/store/zsmc0yjbjrfbamm9ycrlz5yzi5hrbag1-libcurand-10.3.10.19-dev/include\;/nix/store/ip4lb9ximc445dbdkdvia4whx83g00g3-libcusolver-11.7.5.82-dev/include\;/nix/store/81xppf0rrqfasvg7wy4z891ab473nb9v-libcufile-1.14.1.1-dev/include\;/nix/store/nkvyh0qxbfj2wbm3r800xd6x1fhs1s4x-cuda_cccl-12.9.27-dev/include\;/nix/store/ik96pdimvw3bjj8wdr6laxycnn5lpwby-libcufft-11.4.1.4-dev/include\;/nix/store/f9r19xpj8qayy3b74gx3gbjrq0z1aq3b-cuda_nvml_dev-12.9.79-dev/include\;/nix/store/0kycn0pb0x46h16afxw2bjrm1gjq1355-cuda_profiler_api-12.9.79-dev/include\;/nix/store/z2xfln4d3r92hjjihlq5w6hvh5qhpcb4-cudnn-9.11.0.98-dev/include\;/nix/store/x4w41r4jyapqwdghvi6xrpd0mnim4x08-cuda_cudart-12.9.79-dev/include\;/nix/store/8zrv6h6f2cfz34pwq012n4cx2zrv5m1s-cuda_nvcc-12.9.86/include\;/nix/store/f21f8hghg4fiwa2ix29h1zy854p7q4v6-cuda_nvrtc-12.9.86-dev/include\;/nix/store/ns0brisbkgrjyfi16rlyjjgcym4jk6qv-cuda_cupti-12.9.79-dev/include -DCUDAToolkit_ROOT=/nix/store/8zrv6h6f2cfz34pwq012n4cx2zrv5m1s-cuda_nvcc-12.9.86\;/nix/store/q2al0drhrl0yxk97xbsjl8d0h25kmsq9-libcurand-10.3.10.19-lib\;/nix/store/ax1ssn45048qbmyy19basgv6q64y5jy0-cuda_cupti-12.9.79\;/nix/store/m09542l6q83flp3asv2r4j3wcbjqksvg-libcufile-1.14.1.1-static\;/nix/store/b3wbcra9cziq8bwf3yhmj2nn1mf5bqy2-cuda_cudart-12.9.79-lib\;/nix/store/j5kp5fg9mn6hhslk18wbmskc7v96l353-cuda_cupti-12.9.79-static\;/nix/store/kky5wd8qwb0hx3jb3j9qc1bkwznw3z83-libcusparse-12.5.10.65-dev\;/nix/store/dd8wl3nnsigw2gj5bwaiswla97jpw1jz-libcublas-12.9.1.4-dev\;/nix/store/zsmc0yjbjrfbamm9ycrlz5yzi5hrbag1-libcurand-10.3.10.19-dev\;/nix/store/3s79bz4ldkhlks6jf9a2jd4r34y6018b-libcurand-10.3.10.19\;/nix/store/v48xzq66pzmygxqkws17n9nvpa7lad9d-cuda_nvml_dev-12.9.79\;/nix/store/6via2axi1n31n685jii6dwaiqca8b2rc-cuda_nvcc-12.9.86-static\;/nix/store/v0hx9fqdlmz9kvjd9sqr2zc141ny10yn-cuda_profiler_api-12.9.79\;/nix/store/ip4lb9ximc445dbdkdvia4whx83g00g3-libcusolver-11.7.5.82-dev\;/nix/store/8cig7k11qv5g8x0j8n2mbdfzwrnf7cg2-cuda_cudart-12.9.79-stubs\;/nix/store/xg8pj5m74n2h3v8kgxbvmbpcl90rzmlx-cudnn-9.11.0.98-static\;/nix/store/v4b7mkhyq1akczzkcyynj7y9c61l9dc7-cuda_cudart-12.9.79-static\;/nix/store/hw2swakbrvi4innrymcw8i2m98p73br0-cuda_cupti-12.9.79-sample\;/nix/store/s1i2kadnni2m4skpzzqzfzc3bpmrxi7p-libcusparse-12.5.10.65-lib\;/nix/store/81xppf0rrqfasvg7wy4z891ab473nb9v-libcufile-1.14.1.1-dev\;/nix/store/0a83zdhkh2i9d97r4zqdn8fi8vn4wfk3-libcublas-12.9.1.4-static\;/nix/store/nkvyh0qxbfj2wbm3r800xd6x1fhs1s4x-cuda_cccl-12.9.27-dev\;/nix/store/jnhjz87sm9nbnb72n54jj2l99szrzpg2-libcusparse-12.5.10.65\;/nix/store/ik96pdimvw3bjj8wdr6laxycnn5lpwby-libcufft-11.4.1.4-dev\;/nix/store/d1m6c5i6y6ncjygpdmv1b4pmd91hvjr2-cuda_cupti-12.9.79-lib\;/nix/store/49p6af3v11dcxvq9andr6l8csa2sr4j4-cuda_nvrtc-12.9.86-static\;/nix/store/bfygrgghga26l7br5d5j3h6hd1s21rkn-cudnn-9.11.0.98\;/nix/store/a6an9chi5dvjsybrfrxql0bn76xswzpa-libcufft-11.4.1.4\;/nix/store/f9r19xpj8qayy3b74gx3gbjrq0z1aq3b-cuda_nvml_dev-12.9.79-dev\;/nix/store/7zy91byrxpnyzhjlwham2gqyir2x6f54-libcusolver-11.7.5.82-lib\;/nix/store/0kycn0pb0x46h16afxw2bjrm1gjq1355-cuda_profiler_api-12.9.79-dev\;/nix/store/cx0hyla7fkqqc5hh1gn4hkarjyjvbjhf-libcusparse-12.5.10.65-static\;/
nix/store/3yi8kx62nklnyn77zn4z23hi03l9c7ff-libcusolver-11.7.5.82-static\;/nix/store/z2xfln4d3r92hjjihlq5w6hvh5qhpcb4-cudnn-9.11.0.98-dev\;/nix/store/86nq76ks8vlgjdsnh1hkskyfw7mm3plc-cuda_cccl-12.9.27\;/nix/store/01ywykdxfkvp64318anifgx7zaavz9ql-cuda_nvml_dev-12.9.79-lib\;/nix/store/qv2m9i0nby2p03xx37mkkm84dlqb9s84-cuda_cudart-12.9.79\;/nix/store/a09saq5rl5jxbgv9gqllx0080ypjk00x-libcufile-1.14.1.1-lib\;/nix/store/0l18n4dhavr0p4rk0nyqqjr8paacak13-libcufile-1.14.1.1\;/nix/store/r8ly0w88qv4gw3lhd784ha0ag221c23s-cuda_nvrtc-12.9.86-lib\;/nix/store/rngn6cls1blhilrw78xb3pjgwghibhzk-libcurand-10.3.10.19-static\;/nix/store/x4w41r4jyapqwdghvi6xrpd0mnim4x08-cuda_cudart-12.9.79-dev\;/nix/store/ikw7sqic4kknjkp50dr54khgs06q1hbv-cuda_nvml_dev-12.9.79-static\;/nix/store/bzdnjn29xj8a73wg16qrz0sswi9svp0x-libcublas-12.9.1.4\;/nix/store/62hqkwasnanq5i1j63z4clc0s4c61k1r-libcufft-11.4.1.4-static\;/nix/store/5sjldyn2vmm4ky24v1f9ggs0hps496q3-libcusolver-11.7.5.82\;/nix/store/9c924z3749bfm078bwq4ad12kjz46pjf-libcufft-11.4.1.4-lib\;/nix/store/f21f8hghg4fiwa2ix29h1zy854p7q4v6-cuda_nvrtc-12.9.86-dev\;/nix/store/c1kdvq8xqqkwzzazl99w20h4x9z0f9pc-libcublas-12.9.1.4-lib\;/nix/store/ns0brisbkgrjyfi16rlyjjgcym4jk6qv-cuda_cupti-12.9.79-dev\;/nix/store/h6kzw3gvlv4sa0apb4fflpjlirhj72ga-cudnn-9.11.0.98-lib\;/nix/store/f5gvpjis5y727lw6vzr2h1zkb3hm08k2-cuda_nvrtc-12.9.86 -DPROTOC_EXE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DProtobuf_PROTOC_EXE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DProtobuf_PROTOC_EXECUTABLE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DPYBIND11_PYTHONLIBS_OVERWRITE=OFF -DPYTHON_EXECUTABLE=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/bin/python3.13 -DPYTHON_INCLUDE_DIR=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/include/python3.13 -DPYTHON_SITE_PACKAGES=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/lib/python3.13/site-packages +sage_attention-torch-ext> fixing cmake files... 
+sage_attention-torch-ext> cmake flags: -GNinja -DCMAKE_FIND_USE_SYSTEM_PACKAGE_REGISTRY=OFF -DCMAKE_FIND_USE_PACKAGE_REGISTRY=OFF -DCMAKE_EXPORT_NO_PACKAGE_REGISTRY=ON -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTING=OFF -DCMAKE_INSTALL_LOCALEDIR=/nix/store/6jmb39fj6a2hjg90bjmrxna5vkivwy8n-sage_attention-torch-ext/share/locale -DCMAKE_INSTALL_LIBEXECDIR=/nix/store/6jmb39fj6a2hjg90bjmrxna5vkivwy8n-sage_attention-torch-ext/libexec -DCMAKE_INSTALL_LIBDIR=/nix/store/6jmb39fj6a2hjg90bjmrxna5vkivwy8n-sage_attention-torch-ext/lib -DCMAKE_INSTALL_DOCDIR=/nix/store/6jmb39fj6a2hjg90bjmrxna5vkivwy8n-sage_attention-torch-ext/share/doc/sage_attention -DCMAKE_INSTALL_INFODIR=/nix/store/6jmb39fj6a2hjg90bjmrxna5vkivwy8n-sage_attention-torch-ext/share/info -DCMAKE_INSTALL_MANDIR=/nix/store/6jmb39fj6a2hjg90bjmrxna5vkivwy8n-sage_attention-torch-ext/share/man -DCMAKE_INSTALL_INCLUDEDIR=/nix/store/6jmb39fj6a2hjg90bjmrxna5vkivwy8n-sage_attention-torch-ext/include -DCMAKE_INSTALL_SBINDIR=/nix/store/6jmb39fj6a2hjg90bjmrxna5vkivwy8n-sage_attention-torch-ext/sbin -DCMAKE_INSTALL_BINDIR=/nix/store/6jmb39fj6a2hjg90bjmrxna5vkivwy8n-sage_attention-torch-ext/bin -DCMAKE_INSTALL_NAME_DIR=/nix/store/6jmb39fj6a2hjg90bjmrxna5vkivwy8n-sage_attention-torch-ext/lib -DCMAKE_POLICY_DEFAULT_CMP0025=NEW -DCMAKE_FIND_FRAMEWORK=LAST -DCMAKE_STRIP=/nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/strip -DCMAKE_RANLIB=/nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/ranlib -DCMAKE_AR=/nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/ar -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DCMAKE_INSTALL_PREFIX=/nix/store/6jmb39fj6a2hjg90bjmrxna5vkivwy8n-sage_attention-torch-ext -DPython_EXECUTABLE:STRING=/nix/store/qal2apcjwlw2p2kk05dwqdgzh8ml687l-python3-3.13.6-env/bin/python -DCMAKE_CUDA_HOST_COMPILER:STRING=/nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/g++ -DNVCC_THREADS=3 -DCUDAToolkit_INCLUDE_DIR=/nix/store/2dc9bgppqvyd6bd5m4j9zphiyhhd39lv-libcurand-10.3.9.90-dev/include\;/nix/store/x6d389mfn7v413ia2had715g7rdgghgm-cuda_nvrtc-12.8.93-dev/include\;/nix/store/4sz65s9xk80q9jij0i4zbp9xd1pmr3ja-libcusparse-12.5.8.93-dev/include\;/nix/store/11bshw90q985bpd9ds649qmgg0x54q7x-cudnn-9.11.0.98-dev/include\;/nix/store/8dwjdyr7y3dkqlgswpn9swz884lx62gf-cuda_cccl-12.8.90-dev/include\;/nix/store/4cq7zkla3djm6g5gkpzzx4gfikda2k7z-cuda_profiler_api-12.8.90-dev/include\;/nix/store/90nghg4zsrw6gki8y8hw4id3p31bc8rk-libcusolver-11.7.3.90-dev/include\;/nix/store/vg32acb8vlqyhkhabbgvmralfw0kwhi3-cuda_cudart-12.8.90-dev/include\;/nix/store/vqg4r8izl1fy2smmw4dwv4x1adkj0rfb-libcufft-11.3.3.83-dev/include\;/nix/store/8kyv8ffbfvksnqmm1kaz0llysg7dpn9z-cuda_nvcc-12.8.93/include\;/nix/store/5pvax5f2dg278j43b4llkdxim9y0bjaf-cuda_nvml_dev-12.8.90-dev/include\;/nix/store/mps4gsnyk6s676zadvcykjxn08yghk5a-libcufile-1.13.1.3-dev/include\;/nix/store/gz9xyhflw755r8fcxkc816fp54sj0hl4-cuda_cupti-12.8.90-dev/include\;/nix/store/qa4d2v0lsm6giyr4b4421qsdygz0yrrh-libcublas-12.8.4.1-dev/include 
-DCUDAToolkit_ROOT=/nix/store/8kyv8ffbfvksnqmm1kaz0llysg7dpn9z-cuda_nvcc-12.8.93\;/nix/store/w96jlfiy431jnsww1x3ak3chhssa3i2s-libcusparse-12.5.8.93\;/nix/store/6zj6v3b9v8xdjs94iq1228slqwr757ij-libcublas-12.8.4.1\;/nix/store/q85pndpvaqdznfijmkn0mlfp8y3v08dl-cuda_cccl-12.8.90\;/nix/store/2dc9bgppqvyd6bd5m4j9zphiyhhd39lv-libcurand-10.3.9.90-dev\;/nix/store/cwy7010iwla9b2v1fx82sp66v12r913x-libcublas-12.8.4.1-lib\;/nix/store/x6d389mfn7v413ia2had715g7rdgghgm-cuda_nvrtc-12.8.93-dev\;/nix/store/22n25ss46s0hgspdp26qk025w9m393cd-libcublas-12.8.4.1-static\;/nix/store/sc5wnfvmk0j73xdppxj25kgk8s98lscs-cuda_nvrtc-12.8.93-lib\;/nix/store/54wqrrh6qbrwmv2wkz6b216ljrqbhcji-cudnn-9.11.0.98\;/nix/store/4sz65s9xk80q9jij0i4zbp9xd1pmr3ja-libcusparse-12.5.8.93-dev\;/nix/store/11bshw90q985bpd9ds649qmgg0x54q7x-cudnn-9.11.0.98-dev\;/nix/store/8dwjdyr7y3dkqlgswpn9swz884lx62gf-cuda_cccl-12.8.90-dev\;/nix/store/1v8m3gdw08hnbs7qa4jbkflm9lg1r5q6-libcurand-10.3.9.90\;/nix/store/jc58pv1cxhvpblrnzgaai60x04q6m0bp-cuda_nvml_dev-12.8.90-lib\;/nix/store/khwhv5d4kmzjpsm785iz3sva6i9sj9r5-libcufile-1.13.1.3-static\;/nix/store/xv6c2jcc3adyqks2xl28p4r0q1g4bc92-cuda_cupti-12.8.90\;/nix/store/a2h2yfjfx0si8smnqmghw7ccj0qbnv81-cuda_cupti-12.8.90-lib\;/nix/store/4cq7zkla3djm6g5gkpzzx4gfikda2k7z-cuda_profiler_api-12.8.90-dev\;/nix/store/xccbzbpcn8r506zdvhvbkqkilhlrh3c5-cuda_cudart-12.8.90-lib\;/nix/store/acbir62i1d7kvka4plmxsq8442z7r1l2-cuda_cudart-12.8.90-stubs\;/nix/store/ckkcbggf4x93zg3xn9xr00jgxs2x5p21-cuda_nvml_dev-12.8.90-static\;/nix/store/ml3bkm8bz1lnjmfd8lyxbjqpi1llasr2-libcusolver-11.7.3.90\;/nix/store/9zlrjnq7lisarny3llszk131vy816x2w-libcufile-1.13.1.3\;/nix/store/90nghg4zsrw6gki8y8hw4id3p31bc8rk-libcusolver-11.7.3.90-dev\;/nix/store/vg32acb8vlqyhkhabbgvmralfw0kwhi3-cuda_cudart-12.8.90-dev\;/nix/store/y27d2s3rcw8d17wcw23glhlj5rhs8d6y-cuda_cudart-12.8.90\;/nix/store/n96pib9yj31n031dmrrx43m61js1r5rn-cuda_nvcc-12.8.93-static\;/nix/store/pabakly3280dnghh3i89wklfm61raf7z-cuda_cupti-12.8.90-sample\;/nix/store/l0jiwp1f0dhigd41qqf408c5qyabz2vd-cudnn-9.11.0.98-static\;/nix/store/95lzbxp68m127n6hyllbr3dh2mlj7y8m-libcufft-11.3.3.83\;/nix/store/lxsd5l6hnqcfgqc1nsn8mmmpx385m3k8-libcusparse-12.5.8.93-lib\;/nix/store/vqg4r8izl1fy2smmw4dwv4x1adkj0rfb-libcufft-11.3.3.83-dev\;/nix/store/4b9rdinnksj1856siw3qmwi9f10480ii-cuda_nvrtc-12.8.93-static\;/nix/store/qh7zggir1ikzh3kvkhi2mqzpyisl4153-libcurand-10.3.9.90-static\;/nix/store/n25l4gcpw8cry4rg2a4c9jw3f53i65zd-libcusolver-11.7.3.90-lib\;/nix/store/xh73kc8spwfvd6w6wc63pyq3zm6qlrja-cuda_nvml_dev-12.8.90\;/nix/store/bgiqy1z8588hgcdzyh9brhc015w3nii0-libcurand-10.3.9.90-lib\;/nix/store/5pvax5f2dg278j43b4llkdxim9y0bjaf-cuda_nvml_dev-12.8.90-dev\;/nix/store/7lf23alvk7yh64flf2mj6smx66sqyz9d-libcufile-1.13.1.3-lib\;/nix/store/lfqj2ni7r0ir3n840b8r1lh63mnqr0ar-libcusparse-12.5.8.93-static\;/nix/store/qmw5pq21avnfvsk657k0zr4nsgwxa4jm-cuda_cudart-12.8.90-static\;/nix/store/826d39r2b4gwafqsyhvzq2bmqv8ygzrd-cuda_profiler_api-12.8.90\;/nix/store/g52lygjflrsyr6wahpf0rvs3fpna3wq9-cudnn-9.11.0.98-lib\;/nix/store/gxw5c9f7q2f1pmy0g1zyblb8p2p891a4-libcufft-11.3.3.83-lib\;/nix/store/pbsi8w1in7q44z83ndqsaxyzfrr2frgh-cuda_nvrtc-12.8.93\;/nix/store/mps4gsnyk6s676zadvcykjxn08yghk5a-libcufile-1.13.1.3-dev\;/nix/store/mvfnbb1m20fkv2n0j69ky9s9afn8p7h1-libcufft-11.3.3.83-static\;/nix/store/8byjxgnvhcyav2283wcxp752d8280c36-libcusolver-11.7.3.90-static\;/nix/store/gz9xyhflw755r8fcxkc816fp54sj0hl4-cuda_cupti-12.8.90-dev\;/nix/store/jyd8jp3q1d408n8842rb8g6ziviwm7q1-cuda_cupti-12.8.90-static\;/nix/store/qa4d2v0lsm6giyr4b4421qsdygz0y
rrh-libcublas-12.8.4.1-dev -DPROTOC_EXE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DProtobuf_PROTOC_EXE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DProtobuf_PROTOC_EXECUTABLE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DPYBIND11_PYTHONLIBS_OVERWRITE=OFF -DPYTHON_EXECUTABLE=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/bin/python3.13 -DPYTHON_INCLUDE_DIR=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/include/python3.13 -DPYTHON_SITE_PACKAGES=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/lib/python3.13/site-packages +sage_attention-torch-ext> cmake flags: -GNinja -DCMAKE_FIND_USE_SYSTEM_PACKAGE_REGISTRY=OFF -DCMAKE_FIND_USE_PACKAGE_REGISTRY=OFF -DCMAKE_EXPORT_NO_PACKAGE_REGISTRY=ON -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTING=OFF -DCMAKE_INSTALL_LOCALEDIR=/nix/store/x0vcv18dr2mcj5ih9i3aq3nshydimpca-sage_attention-torch-ext/share/locale -DCMAKE_INSTALL_LIBEXECDIR=/nix/store/x0vcv18dr2mcj5ih9i3aq3nshydimpca-sage_attention-torch-ext/libexec -DCMAKE_INSTALL_LIBDIR=/nix/store/x0vcv18dr2mcj5ih9i3aq3nshydimpca-sage_attention-torch-ext/lib -DCMAKE_INSTALL_DOCDIR=/nix/store/x0vcv18dr2mcj5ih9i3aq3nshydimpca-sage_attention-torch-ext/share/doc/sage_attention -DCMAKE_INSTALL_INFODIR=/nix/store/x0vcv18dr2mcj5ih9i3aq3nshydimpca-sage_attention-torch-ext/share/info -DCMAKE_INSTALL_MANDIR=/nix/store/x0vcv18dr2mcj5ih9i3aq3nshydimpca-sage_attention-torch-ext/share/man -DCMAKE_INSTALL_INCLUDEDIR=/nix/store/x0vcv18dr2mcj5ih9i3aq3nshydimpca-sage_attention-torch-ext/include -DCMAKE_INSTALL_SBINDIR=/nix/store/x0vcv18dr2mcj5ih9i3aq3nshydimpca-sage_attention-torch-ext/sbin -DCMAKE_INSTALL_BINDIR=/nix/store/x0vcv18dr2mcj5ih9i3aq3nshydimpca-sage_attention-torch-ext/bin -DCMAKE_INSTALL_NAME_DIR=/nix/store/x0vcv18dr2mcj5ih9i3aq3nshydimpca-sage_attention-torch-ext/lib -DCMAKE_POLICY_DEFAULT_CMP0025=NEW -DCMAKE_FIND_FRAMEWORK=LAST -DCMAKE_STRIP=/nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/strip -DCMAKE_RANLIB=/nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/ranlib -DCMAKE_AR=/nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/ar -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DCMAKE_INSTALL_PREFIX=/nix/store/x0vcv18dr2mcj5ih9i3aq3nshydimpca-sage_attention-torch-ext -DPython_EXECUTABLE:STRING=/nix/store/wirj6dihrpcch7idfd7jy4l0hqfsgkk1-python3-3.13.6-env/bin/python -DCMAKE_CUDA_HOST_COMPILER:STRING=/nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/g++ -DNVCC_THREADS=3 
-DCUDAToolkit_INCLUDE_DIR=/nix/store/2dc9bgppqvyd6bd5m4j9zphiyhhd39lv-libcurand-10.3.9.90-dev/include\;/nix/store/x6d389mfn7v413ia2had715g7rdgghgm-cuda_nvrtc-12.8.93-dev/include\;/nix/store/4sz65s9xk80q9jij0i4zbp9xd1pmr3ja-libcusparse-12.5.8.93-dev/include\;/nix/store/11bshw90q985bpd9ds649qmgg0x54q7x-cudnn-9.11.0.98-dev/include\;/nix/store/8dwjdyr7y3dkqlgswpn9swz884lx62gf-cuda_cccl-12.8.90-dev/include\;/nix/store/4cq7zkla3djm6g5gkpzzx4gfikda2k7z-cuda_profiler_api-12.8.90-dev/include\;/nix/store/90nghg4zsrw6gki8y8hw4id3p31bc8rk-libcusolver-11.7.3.90-dev/include\;/nix/store/vg32acb8vlqyhkhabbgvmralfw0kwhi3-cuda_cudart-12.8.90-dev/include\;/nix/store/vqg4r8izl1fy2smmw4dwv4x1adkj0rfb-libcufft-11.3.3.83-dev/include\;/nix/store/8kyv8ffbfvksnqmm1kaz0llysg7dpn9z-cuda_nvcc-12.8.93/include\;/nix/store/5pvax5f2dg278j43b4llkdxim9y0bjaf-cuda_nvml_dev-12.8.90-dev/include\;/nix/store/klis291y7cza60yzgkxzbid80bnyshmr-cuda_nvtx-12.8.90-dev/include\;/nix/store/mps4gsnyk6s676zadvcykjxn08yghk5a-libcufile-1.13.1.3-dev/include\;/nix/store/gz9xyhflw755r8fcxkc816fp54sj0hl4-cuda_cupti-12.8.90-dev/include\;/nix/store/qa4d2v0lsm6giyr4b4421qsdygz0yrrh-libcublas-12.8.4.1-dev/include -DCUDAToolkit_ROOT=/nix/store/8kyv8ffbfvksnqmm1kaz0llysg7dpn9z-cuda_nvcc-12.8.93\;/nix/store/w96jlfiy431jnsww1x3ak3chhssa3i2s-libcusparse-12.5.8.93\;/nix/store/6zj6v3b9v8xdjs94iq1228slqwr757ij-libcublas-12.8.4.1\;/nix/store/q85pndpvaqdznfijmkn0mlfp8y3v08dl-cuda_cccl-12.8.90\;/nix/store/2dc9bgppqvyd6bd5m4j9zphiyhhd39lv-libcurand-10.3.9.90-dev\;/nix/store/cwy7010iwla9b2v1fx82sp66v12r913x-libcublas-12.8.4.1-lib\;/nix/store/x6d389mfn7v413ia2had715g7rdgghgm-cuda_nvrtc-12.8.93-dev\;/nix/store/22n25ss46s0hgspdp26qk025w9m393cd-libcublas-12.8.4.1-static\;/nix/store/sc5wnfvmk0j73xdppxj25kgk8s98lscs-cuda_nvrtc-12.8.93-lib\;/nix/store/54wqrrh6qbrwmv2wkz6b216ljrqbhcji-cudnn-9.11.0.98\;/nix/store/4sz65s9xk80q9jij0i4zbp9xd1pmr3ja-libcusparse-12.5.8.93-dev\;/nix/store/11bshw90q985bpd9ds649qmgg0x54q7x-cudnn-9.11.0.98-dev\;/nix/store/8dwjdyr7y3dkqlgswpn9swz884lx62gf-cuda_cccl-12.8.90-dev\;/nix/store/1v8m3gdw08hnbs7qa4jbkflm9lg1r5q6-libcurand-10.3.9.90\;/nix/store/jc58pv1cxhvpblrnzgaai60x04q6m0bp-cuda_nvml_dev-12.8.90-lib\;/nix/store/khwhv5d4kmzjpsm785iz3sva6i9sj9r5-libcufile-1.13.1.3-static\;/nix/store/xv6c2jcc3adyqks2xl28p4r0q1g4bc92-cuda_cupti-12.8.90\;/nix/store/a2h2yfjfx0si8smnqmghw7ccj0qbnv81-cuda_cupti-12.8.90-lib\;/nix/store/4cq7zkla3djm6g5gkpzzx4gfikda2k7z-cuda_profiler_api-12.8.90-dev\;/nix/store/5f6dvklv5d0mvygrrf0vzp0smcn7kk01-cuda_nvtx-12.8.90\;/nix/store/xccbzbpcn8r506zdvhvbkqkilhlrh3c5-cuda_cudart-12.8.90-lib\;/nix/store/acbir62i1d7kvka4plmxsq8442z7r1l2-cuda_cudart-12.8.90-stubs\;/nix/store/ckkcbggf4x93zg3xn9xr00jgxs2x5p21-cuda_nvml_dev-12.8.90-static\;/nix/store/ml3bkm8bz1lnjmfd8lyxbjqpi1llasr2-libcusolver-11.7.3.90\;/nix/store/9zlrjnq7lisarny3llszk131vy816x2w-libcufile-1.13.1.3\;/nix/store/90nghg4zsrw6gki8y8hw4id3p31bc8rk-libcusolver-11.7.3.90-dev\;/nix/store/vg32acb8vlqyhkhabbgvmralfw0kwhi3-cuda_cudart-12.8.90-dev\;/nix/store/y27d2s3rcw8d17wcw23glhlj5rhs8d6y-cuda_cudart-12.8.90\;/nix/store/wa9pr3485k3mw8jhv7i9kfzjrqmdl5bb-cuda_nvtx-12.8.90-lib\;/nix/store/n96pib9yj31n031dmrrx43m61js1r5rn-cuda_nvcc-12.8.93-static\;/nix/store/pabakly3280dnghh3i89wklfm61raf7z-cuda_cupti-12.8.90-sample\;/nix/store/l0jiwp1f0dhigd41qqf408c5qyabz2vd-cudnn-9.11.0.98-static\;/nix/store/95lzbxp68m127n6hyllbr3dh2mlj7y8m-libcufft-11.3.3.83\;/nix/store/lxsd5l6hnqcfgqc1nsn8mmmpx385m3k8-libcusparse-12.5.8.93-lib\;/nix/store/vqg4r8izl1fy2smmw4dwv4x1adkj0rfb-libcufft-11.3.3.
83-dev\;/nix/store/4b9rdinnksj1856siw3qmwi9f10480ii-cuda_nvrtc-12.8.93-static\;/nix/store/qh7zggir1ikzh3kvkhi2mqzpyisl4153-libcurand-10.3.9.90-static\;/nix/store/n25l4gcpw8cry4rg2a4c9jw3f53i65zd-libcusolver-11.7.3.90-lib\;/nix/store/xh73kc8spwfvd6w6wc63pyq3zm6qlrja-cuda_nvml_dev-12.8.90\;/nix/store/bgiqy1z8588hgcdzyh9brhc015w3nii0-libcurand-10.3.9.90-lib\;/nix/store/5pvax5f2dg278j43b4llkdxim9y0bjaf-cuda_nvml_dev-12.8.90-dev\;/nix/store/7lf23alvk7yh64flf2mj6smx66sqyz9d-libcufile-1.13.1.3-lib\;/nix/store/klis291y7cza60yzgkxzbid80bnyshmr-cuda_nvtx-12.8.90-dev\;/nix/store/lfqj2ni7r0ir3n840b8r1lh63mnqr0ar-libcusparse-12.5.8.93-static\;/nix/store/qmw5pq21avnfvsk657k0zr4nsgwxa4jm-cuda_cudart-12.8.90-static\;/nix/store/826d39r2b4gwafqsyhvzq2bmqv8ygzrd-cuda_profiler_api-12.8.90\;/nix/store/g52lygjflrsyr6wahpf0rvs3fpna3wq9-cudnn-9.11.0.98-lib\;/nix/store/gxw5c9f7q2f1pmy0g1zyblb8p2p891a4-libcufft-11.3.3.83-lib\;/nix/store/pbsi8w1in7q44z83ndqsaxyzfrr2frgh-cuda_nvrtc-12.8.93\;/nix/store/mps4gsnyk6s676zadvcykjxn08yghk5a-libcufile-1.13.1.3-dev\;/nix/store/mvfnbb1m20fkv2n0j69ky9s9afn8p7h1-libcufft-11.3.3.83-static\;/nix/store/8byjxgnvhcyav2283wcxp752d8280c36-libcusolver-11.7.3.90-static\;/nix/store/gz9xyhflw755r8fcxkc816fp54sj0hl4-cuda_cupti-12.8.90-dev\;/nix/store/jyd8jp3q1d408n8842rb8g6ziviwm7q1-cuda_cupti-12.8.90-static\;/nix/store/qa4d2v0lsm6giyr4b4421qsdygz0yrrh-libcublas-12.8.4.1-dev -DPROTOC_EXE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DProtobuf_PROTOC_EXE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DProtobuf_PROTOC_EXECUTABLE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DPYBIND11_PYTHONLIBS_OVERWRITE=OFF -DPYTHON_EXECUTABLE=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/bin/python3.13 -DPYTHON_INCLUDE_DIR=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/include/python3.13 -DPYTHON_SITE_PACKAGES=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/lib/python3.13/site-packages +sage_attention-torch-ext> -- The CXX compiler identification is GNU 13.4.0 +sage_attention-torch-ext> -- Detecting CXX compiler ABI info +sage_attention-torch-ext> -- The CXX compiler identification is GNU 13.4.0 +sage_attention-torch-ext> -- Detecting CXX compiler ABI info +sage_attention-torch-ext> -- The CXX compiler identification is GNU 14.3.0 +sage_attention-torch-ext> -- The CXX compiler identification is GNU 14.3.0 +sage_attention-torch-ext> -- Detecting CXX compiler ABI info +sage_attention-torch-ext> -- Detecting CXX compiler ABI info +sage_attention-torch-ext> -- The CXX compiler identification is GNU 14.3.0 +sage_attention-torch-ext> -- Detecting CXX compiler ABI info +sage_attention-torch-ext> -- Detecting CXX compiler ABI info - done +sage_attention-torch-ext> -- Detecting CXX compiler ABI info - done +sage_attention-torch-ext> -- Detecting CXX compiler ABI info - done +sage_attention-torch-ext> -- Detecting CXX compiler ABI info - done +sage_attention-torch-ext> -- Check for working CXX compiler: /nix/store/rgfv9lch0b6ksjzlzsx0mljsb0ypqr8x-gcc-wrapper-13.4.0/bin/g++ - skipped +sage_attention-torch-ext> -- Detecting CXX compile features +sage_attention-torch-ext> -- Detecting CXX compile features - done +sage_attention-torch-ext> -- Check for working CXX compiler: /nix/store/rgfv9lch0b6ksjzlzsx0mljsb0ypqr8x-gcc-wrapper-13.4.0/bin/g++ - skipped +sage_attention-torch-ext> -- Detecting CXX compile features +sage_attention-torch-ext> -- Detecting CXX compile features - done +sage_attention-torch-ext> -- 
Detecting CXX compiler ABI info - done +sage_attention-torch-ext> -- Check for working CXX compiler: /nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/g++ - skipped +sage_attention-torch-ext> -- Detecting CXX compile features +sage_attention-torch-ext> -- Detecting CXX compile features - done +sage_attention-torch-ext> -- Check for working CXX compiler: /nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/g++ - skipped +sage_attention-torch-ext> -- Detecting CXX compile features +sage_attention-torch-ext> -- Detecting CXX compile features - done +sage_attention-torch-ext> -- FetchContent base directory: /build/source/build/_deps +sage_attention-torch-ext> -- Check for working CXX compiler: /nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/g++ - skipped +sage_attention-torch-ext> -- Detecting CXX compile features +sage_attention-torch-ext> -- Detecting CXX compile features - done +sage_attention-torch-ext> -- FetchContent base directory: /build/source/build/_deps +sage_attention-torch-ext> -- FetchContent base directory: /build/source/build/_deps +sage_attention-torch-ext> -- FetchContent base directory: /build/source/build/_deps +sage_attention-torch-ext> -- FetchContent base directory: /build/source/build/_deps +sage_attention-torch-ext> -- Found Python: /nix/store/j6r6hpjs8p5m4s3i8cqqavg62fd5z48g-python3-3.13.6-env/bin/python (found version "3.13.6") found components: Development Development.SABIModule Interpreter Development.Module Development.Embed +sage_attention-torch-ext> -- Found Python: /nix/store/r3gwdvvsgl1csl12f4pkhz0jhsch7bdy-python3-3.13.6-env/bin/python (found version "3.13.6") found components: Development Development.SABIModule Interpreter Development.Module Development.Embed +sage_attention-torch-ext> -- Found Python: /nix/store/aikr517kmcd8r2nrrj70jq71d7352qiq-python3-3.13.6-env/bin/python (found version "3.13.6") found components: Development Development.SABIModule Interpreter Development.Module Development.Embed +sage_attention-torch-ext> -- Found Python: /nix/store/qal2apcjwlw2p2kk05dwqdgzh8ml687l-python3-3.13.6-env/bin/python (found version "3.13.6") found components: Development Development.SABIModule Interpreter Development.Module Development.Embed +sage_attention-torch-ext> -- Found Python: /nix/store/wirj6dihrpcch7idfd7jy4l0hqfsgkk1-python3-3.13.6-env/bin/python (found version "3.13.6") found components: Development Development.SABIModule Interpreter Development.Module Development.Embed +sage_attention-torch-ext> -- Found CUDA: /nix/store/7iw4ipbdy17yzvqjhxpw03i17kq7f7rj-cuda_nvcc-12.6.85 (found version "12.6") +sage_attention-torch-ext> -- Found CUDA: /nix/store/8zrv6h6f2cfz34pwq012n4cx2zrv5m1s-cuda_nvcc-12.9.86 (found version "12.9") +sage_attention-torch-ext> -- Found CUDA: /nix/store/7iw4ipbdy17yzvqjhxpw03i17kq7f7rj-cuda_nvcc-12.6.85 (found version "12.6") +sage_attention-torch-ext> -- Found CUDA: /nix/store/8kyv8ffbfvksnqmm1kaz0llysg7dpn9z-cuda_nvcc-12.8.93 (found version "12.8") +sage_attention-torch-ext> -- Found CUDA: /nix/store/8kyv8ffbfvksnqmm1kaz0llysg7dpn9z-cuda_nvcc-12.8.93 (found version "12.8") +sage_attention-torch-ext> -- The CUDA compiler identification is NVIDIA 12.6.85 with host compiler GNU 13.4.0 +sage_attention-torch-ext> -- Detecting CUDA compiler ABI info +sage_attention-torch-ext> -- The CUDA compiler identification is NVIDIA 12.9.86 with host compiler GNU 14.3.0 +sage_attention-torch-ext> -- Detecting CUDA compiler ABI info +sage_attention-torch-ext> -- The CUDA compiler identification is 
NVIDIA 12.6.85 with host compiler GNU 13.4.0 +sage_attention-torch-ext> -- Detecting CUDA compiler ABI info +sage_attention-torch-ext> -- The CUDA compiler identification is NVIDIA 12.8.93 with host compiler GNU 14.3.0 +sage_attention-torch-ext> -- Detecting CUDA compiler ABI info +sage_attention-torch-ext> -- The CUDA compiler identification is NVIDIA 12.8.93 with host compiler GNU 14.3.0 +sage_attention-torch-ext> -- Detecting CUDA compiler ABI info +sage_attention-torch-ext> -- Detecting CUDA compiler ABI info - done +sage_attention-torch-ext> -- Detecting CUDA compiler ABI info - done +sage_attention-torch-ext> -- Check for working CUDA compiler: /nix/store/7iw4ipbdy17yzvqjhxpw03i17kq7f7rj-cuda_nvcc-12.6.85/bin/nvcc - skipped +sage_attention-torch-ext> -- Detecting CUDA compile features +sage_attention-torch-ext> -- Detecting CUDA compile features - done +sage_attention-torch-ext> -- Detecting CUDA compiler ABI info - done +sage_attention-torch-ext> -- Found CUDAToolkit: /nix/store/7iw4ipbdy17yzvqjhxpw03i17kq7f7rj-cuda_nvcc-12.6.85/include;/nix/store/fy71fffqbwg3xgvygn66kd4igj65gblv-libcublas-12.6.4.1-dev/include (found version "12.6.85") +sage_attention-torch-ext> -- Performing Test CMAKE_HAVE_LIBC_PTHREAD +sage_attention-torch-ext> -- Check for working CUDA compiler: /nix/store/8zrv6h6f2cfz34pwq012n4cx2zrv5m1s-cuda_nvcc-12.9.86/bin/nvcc - skipped +sage_attention-torch-ext> -- Detecting CUDA compile features +sage_attention-torch-ext> -- Detecting CUDA compile features - done +sage_attention-torch-ext> -- Detecting CUDA compiler ABI info - done +sage_attention-torch-ext> -- Found CUDAToolkit: /nix/store/8zrv6h6f2cfz34pwq012n4cx2zrv5m1s-cuda_nvcc-12.9.86/include;/nix/store/dd8wl3nnsigw2gj5bwaiswla97jpw1jz-libcublas-12.9.1.4-dev/include (found version "12.9.86") +sage_attention-torch-ext> -- Performing Test CMAKE_HAVE_LIBC_PTHREAD +sage_attention-torch-ext> -- Check for working CUDA compiler: /nix/store/7iw4ipbdy17yzvqjhxpw03i17kq7f7rj-cuda_nvcc-12.6.85/bin/nvcc - skipped +sage_attention-torch-ext> -- Detecting CUDA compile features +sage_attention-torch-ext> -- Detecting CUDA compile features - done +sage_attention-torch-ext> -- Detecting CUDA compiler ABI info - done +sage_attention-torch-ext> -- Check for working CUDA compiler: /nix/store/8kyv8ffbfvksnqmm1kaz0llysg7dpn9z-cuda_nvcc-12.8.93/bin/nvcc - skipped +sage_attention-torch-ext> -- Found CUDAToolkit: /nix/store/7iw4ipbdy17yzvqjhxpw03i17kq7f7rj-cuda_nvcc-12.6.85/include;/nix/store/fy71fffqbwg3xgvygn66kd4igj65gblv-libcublas-12.6.4.1-dev/include (found version "12.6.85") +sage_attention-torch-ext> -- Performing Test CMAKE_HAVE_LIBC_PTHREAD +sage_attention-torch-ext> -- Detecting CUDA compile features +sage_attention-torch-ext> -- Detecting CUDA compile features - done +sage_attention-torch-ext> -- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Failed +sage_attention-torch-ext> -- Looking for pthread_create in pthreads +sage_attention-torch-ext> -- Found CUDAToolkit: /nix/store/8kyv8ffbfvksnqmm1kaz0llysg7dpn9z-cuda_nvcc-12.8.93/include;/nix/store/qa4d2v0lsm6giyr4b4421qsdygz0yrrh-libcublas-12.8.4.1-dev/include (found version "12.8.93") +sage_attention-torch-ext> -- Performing Test CMAKE_HAVE_LIBC_PTHREAD +sage_attention-torch-ext> -- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Failed +sage_attention-torch-ext> -- Looking for pthread_create in pthreads +sage_attention-torch-ext> -- Check for working CUDA compiler: /nix/store/8kyv8ffbfvksnqmm1kaz0llysg7dpn9z-cuda_nvcc-12.8.93/bin/nvcc - skipped +sage_attention-torch-ext> -- Detecting CUDA 
compile features +sage_attention-torch-ext> -- Detecting CUDA compile features - done +sage_attention-torch-ext> -- Looking for pthread_create in pthreads - not found +sage_attention-torch-ext> -- Looking for pthread_create in pthread +sage_attention-torch-ext> -- Found CUDAToolkit: /nix/store/8kyv8ffbfvksnqmm1kaz0llysg7dpn9z-cuda_nvcc-12.8.93/include;/nix/store/qa4d2v0lsm6giyr4b4421qsdygz0yrrh-libcublas-12.8.4.1-dev/include (found version "12.8.93") +sage_attention-torch-ext> -- Performing Test CMAKE_HAVE_LIBC_PTHREAD +sage_attention-torch-ext> -- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Failed +sage_attention-torch-ext> -- Looking for pthread_create in pthreads +sage_attention-torch-ext> -- Looking for pthread_create in pthreads - not found +sage_attention-torch-ext> -- Looking for pthread_create in pthread +sage_attention-torch-ext> -- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Failed +sage_attention-torch-ext> -- Looking for pthread_create in pthreads +sage_attention-torch-ext> -- Looking for pthread_create in pthread - found +sage_attention-torch-ext> -- Found Threads: TRUE +sage_attention-torch-ext> -- Looking for pthread_create in pthreads - not found +sage_attention-torch-ext> -- Looking for pthread_create in pthread +sage_attention-torch-ext> -- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Failed +sage_attention-torch-ext> -- Looking for pthread_create in pthreads +sage_attention-torch-ext> -- Looking for pthread_create in pthreads - not found +sage_attention-torch-ext> -- Looking for pthread_create in pthread +sage_attention-torch-ext> -- Looking for pthread_create in pthread - found +sage_attention-torch-ext> -- Found Threads: TRUE +sage_attention-torch-ext> -- Looking for pthread_create in pthread - found +sage_attention-torch-ext> -- Found Threads: TRUE +sage_attention-torch-ext> -- Looking for pthread_create in pthreads - not found +sage_attention-torch-ext> -- Looking for pthread_create in pthread +sage_attention-torch-ext> -- Looking for pthread_create in pthread - found +sage_attention-torch-ext> -- Found Threads: TRUE +sage_attention-torch-ext> -- Looking for pthread_create in pthread - found +sage_attention-torch-ext> -- Found Threads: TRUE +sage_attention-torch-ext> -- PyTorch: CUDA detected: 12.6 +sage_attention-torch-ext> -- PyTorch: CUDA nvcc is: /nix/store/7iw4ipbdy17yzvqjhxpw03i17kq7f7rj-cuda_nvcc-12.6.85/bin/nvcc +sage_attention-torch-ext> -- PyTorch: CUDA toolkit directory: /nix/store/7iw4ipbdy17yzvqjhxpw03i17kq7f7rj-cuda_nvcc-12.6.85 +sage_attention-torch-ext> -- PyTorch: CUDA detected: 12.9 +sage_attention-torch-ext> -- PyTorch: CUDA nvcc is: /nix/store/8zrv6h6f2cfz34pwq012n4cx2zrv5m1s-cuda_nvcc-12.9.86/bin/nvcc +sage_attention-torch-ext> -- PyTorch: CUDA toolkit directory: /nix/store/8zrv6h6f2cfz34pwq012n4cx2zrv5m1s-cuda_nvcc-12.9.86 +sage_attention-torch-ext> -- PyTorch: CUDA detected: 12.6 +sage_attention-torch-ext> -- PyTorch: CUDA nvcc is: /nix/store/7iw4ipbdy17yzvqjhxpw03i17kq7f7rj-cuda_nvcc-12.6.85/bin/nvcc +sage_attention-torch-ext> -- PyTorch: CUDA toolkit directory: /nix/store/7iw4ipbdy17yzvqjhxpw03i17kq7f7rj-cuda_nvcc-12.6.85 +sage_attention-torch-ext> -- PyTorch: CUDA detected: 12.8 +sage_attention-torch-ext> -- PyTorch: CUDA nvcc is: /nix/store/8kyv8ffbfvksnqmm1kaz0llysg7dpn9z-cuda_nvcc-12.8.93/bin/nvcc +sage_attention-torch-ext> -- PyTorch: CUDA toolkit directory: /nix/store/8kyv8ffbfvksnqmm1kaz0llysg7dpn9z-cuda_nvcc-12.8.93 +sage_attention-torch-ext> -- PyTorch: Header version is: 12.6 +sage_attention-torch-ext> -- Found Python: 
/nix/store/r3gwdvvsgl1csl12f4pkhz0jhsch7bdy-python3-3.13.6-env/bin/python (found version "3.13.6") found components: Interpreter +sage_attention-torch-ext> CMake Warning at /nix/store/ld6fk094jhhsnbip1406vrky9lmyxbax-python3.13-torch-2.8.0/lib/python3.13/site-packages/torch/share/cmake/Caffe2/public/cuda.cmake:146 (message): +sage_attention-torch-ext> Failed to compute shorthash for libnvrtc.so +sage_attention-torch-ext> Call Stack (most recent call first): +sage_attention-torch-ext> /nix/store/ld6fk094jhhsnbip1406vrky9lmyxbax-python3.13-torch-2.8.0/lib/python3.13/site-packages/torch/share/cmake/Caffe2/Caffe2Config.cmake:86 (include) +sage_attention-torch-ext> /nix/store/ld6fk094jhhsnbip1406vrky9lmyxbax-python3.13-torch-2.8.0/lib/python3.13/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:68 (find_package) +sage_attention-torch-ext> CMakeLists.txt:30 (find_package) +sage_attention-torch-ext> +sage_attention-torch-ext> +sage_attention-torch-ext> -- USE_CUDNN is set to 0. Compiling without cuDNN support +sage_attention-torch-ext> -- USE_CUSPARSELT is set to 0. Compiling without cuSPARSELt support +sage_attention-torch-ext> -- USE_CUDSS is set to 0. Compiling without cuDSS support +sage_attention-torch-ext> -- USE_CUFILE is set to 0. Compiling without cuFile support +sage_attention-torch-ext> -- Added CUDA NVCC flags for: -gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_89,code=sm_89;-gencode;arch=compute_90,code=sm_90 +sage_attention-torch-ext> CMake Warning at /nix/store/ld6fk094jhhsnbip1406vrky9lmyxbax-python3.13-torch-2.8.0/lib/python3.13/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:22 (message): +sage_attention-torch-ext> static library kineto_LIBRARY-NOTFOUND not found. 
+sage_attention-torch-ext> Call Stack (most recent call first): +sage_attention-torch-ext> /nix/store/ld6fk094jhhsnbip1406vrky9lmyxbax-python3.13-torch-2.8.0/lib/python3.13/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:125 (append_torchlib_if_found) +sage_attention-torch-ext> CMakeLists.txt:30 (find_package) +sage_attention-torch-ext> +sage_attention-torch-ext> +sage_attention-torch-ext> -- Found Torch: /nix/store/pg32mpjmckfs38anjzgyvk2ljfw12pb3-python3.13-torch-2.8.0-lib/lib/libtorch.so +sage_attention-torch-ext> -- CUDA target architectures: 7.0;7.5;8.0;8.6;8.9;9.0 +sage_attention-torch-ext> -- CUDA supported target architectures: 7.0;7.5;8.0;8.6;8.9;9.0 +sage_attention-torch-ext> -- PyTorch: Header version is: 12.9 +sage_attention-torch-ext> -- PyTorch: CUDA detected: 12.8 +sage_attention-torch-ext> -- PyTorch: CUDA nvcc is: /nix/store/8kyv8ffbfvksnqmm1kaz0llysg7dpn9z-cuda_nvcc-12.8.93/bin/nvcc +sage_attention-torch-ext> -- PyTorch: CUDA toolkit directory: /nix/store/8kyv8ffbfvksnqmm1kaz0llysg7dpn9z-cuda_nvcc-12.8.93 +sage_attention-torch-ext> -- Found Python: /nix/store/aikr517kmcd8r2nrrj70jq71d7352qiq-python3-3.13.6-env/bin/python (found version "3.13.6") found components: Interpreter +sage_attention-torch-ext> CMake Warning at /nix/store/483ma0klnbln74izv5jiyila52bfwqxh-python3.13-torch-2.8.0/lib/python3.13/site-packages/torch/share/cmake/Caffe2/public/cuda.cmake:146 (message): +sage_attention-torch-ext> Failed to compute shorthash for libnvrtc.so +sage_attention-torch-ext> Call Stack (most recent call first): +sage_attention-torch-ext> /nix/store/483ma0klnbln74izv5jiyila52bfwqxh-python3.13-torch-2.8.0/lib/python3.13/site-packages/torch/share/cmake/Caffe2/Caffe2Config.cmake:86 (include) +sage_attention-torch-ext> /nix/store/483ma0klnbln74izv5jiyila52bfwqxh-python3.13-torch-2.8.0/lib/python3.13/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:68 (find_package) +sage_attention-torch-ext> CMakeLists.txt:30 (find_package) +sage_attention-torch-ext> +sage_attention-torch-ext> +sage_attention-torch-ext> -- USE_CUDNN is set to 0. Compiling without cuDNN support +sage_attention-torch-ext> -- USE_CUSPARSELT is set to 0. Compiling without cuSPARSELt support +sage_attention-torch-ext> -- USE_CUDSS is set to 0. Compiling without cuDSS support +sage_attention-torch-ext> -- USE_CUFILE is set to 0. Compiling without cuFile support +sage_attention-torch-ext> -- Added CUDA NVCC flags for: -gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_89,code=sm_89;-gencode;arch=compute_90,code=sm_90;-gencode;arch=compute_100,code=sm_100;-gencode;arch=compute_101,code=sm_101;-gencode;arch=compute_120,code=sm_120 +sage_attention-torch-ext> -- PyTorch: Header version is: 12.6 +sage_attention-torch-ext> CMake Warning at /nix/store/483ma0klnbln74izv5jiyila52bfwqxh-python3.13-torch-2.8.0/lib/python3.13/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:22 (message): +sage_attention-torch-ext> static library kineto_LIBRARY-NOTFOUND not found. 
+sage_attention-torch-ext> Call Stack (most recent call first): +sage_attention-torch-ext> /nix/store/483ma0klnbln74izv5jiyila52bfwqxh-python3.13-torch-2.8.0/lib/python3.13/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:125 (append_torchlib_if_found) +sage_attention-torch-ext> CMakeLists.txt:30 (find_package) +sage_attention-torch-ext> +sage_attention-torch-ext> +sage_attention-torch-ext> -- Found Torch: /nix/store/zccgvlbr93bhyia3sr9f2mddmkp2jyx7-python3.13-torch-2.8.0-lib/lib/libtorch.so +sage_attention-torch-ext> -- CUDA target architectures: 7.0;7.5;8.0;8.6;8.9;9.0;10.0;10.1;12.0 +sage_attention-torch-ext> -- CUDA supported target architectures: 7.0;7.5;8.0;8.6;8.9;9.0;10.0;10.1;12.0 +sage_attention-torch-ext> -- PyTorch: Header version is: 12.8 +sage_attention-torch-ext> -- Found Python: /nix/store/j6r6hpjs8p5m4s3i8cqqavg62fd5z48g-python3-3.13.6-env/bin/python (found version "3.13.6") found components: Interpreter +sage_attention-torch-ext> CMake Warning at /nix/store/dzz5brlw0xzs9hp3v8fvvwcvkmsr3ls9-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Caffe2/public/cuda.cmake:146 (message): +sage_attention-torch-ext> Failed to compute shorthash for libnvrtc.so +sage_attention-torch-ext> Call Stack (most recent call first): +sage_attention-torch-ext> /nix/store/dzz5brlw0xzs9hp3v8fvvwcvkmsr3ls9-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Caffe2/Caffe2Config.cmake:86 (include) +sage_attention-torch-ext> /nix/store/dzz5brlw0xzs9hp3v8fvvwcvkmsr3ls9-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:68 (find_package) +sage_attention-torch-ext> CMakeLists.txt:30 (find_package) +sage_attention-torch-ext> +sage_attention-torch-ext> +sage_attention-torch-ext> CMake Warning (dev) at /nix/store/0vnarm4qjnj16dr3zj9kwq6bn79c0icn-cmake-3.31.7/share/cmake-3.31/Modules/FindPackageHandleStandardArgs.cmake:441 (message): +sage_attention-torch-ext> The package name passed to `find_package_handle_standard_args` (nvtx3) does +sage_attention-torch-ext> not match the name of the calling package (Caffe2). This can lead to +sage_attention-torch-ext> problems in calling code that expects `find_package` result variables +sage_attention-torch-ext> (e.g., `_FOUND`) to follow a certain pattern. +sage_attention-torch-ext> Call Stack (most recent call first): +sage_attention-torch-ext> /nix/store/dzz5brlw0xzs9hp3v8fvvwcvkmsr3ls9-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Caffe2/public/cuda.cmake:184 (find_package_handle_standard_args) +sage_attention-torch-ext> /nix/store/dzz5brlw0xzs9hp3v8fvvwcvkmsr3ls9-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Caffe2/Caffe2Config.cmake:86 (include) +sage_attention-torch-ext> /nix/store/dzz5brlw0xzs9hp3v8fvvwcvkmsr3ls9-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:68 (find_package) +sage_attention-torch-ext> CMakeLists.txt:30 (find_package) +sage_attention-torch-ext> This warning is for project developers. Use -Wno-dev to suppress it. 
+sage_attention-torch-ext> +sage_attention-torch-ext> -- Could NOT find nvtx3 (missing: nvtx3_dir) +sage_attention-torch-ext> CMake Warning at /nix/store/dzz5brlw0xzs9hp3v8fvvwcvkmsr3ls9-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Caffe2/public/cuda.cmake:190 (message): +sage_attention-torch-ext> Cannot find NVTX3, find old NVTX instead +sage_attention-torch-ext> Call Stack (most recent call first): +sage_attention-torch-ext> /nix/store/dzz5brlw0xzs9hp3v8fvvwcvkmsr3ls9-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Caffe2/Caffe2Config.cmake:86 (include) +sage_attention-torch-ext> /nix/store/dzz5brlw0xzs9hp3v8fvvwcvkmsr3ls9-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:68 (find_package) +sage_attention-torch-ext> CMakeLists.txt:30 (find_package) +sage_attention-torch-ext> +sage_attention-torch-ext> +sage_attention-torch-ext> -- USE_CUDNN is set to 0. Compiling without cuDNN support +sage_attention-torch-ext> -- USE_CUSPARSELT is set to 0. Compiling without cuSPARSELt support +sage_attention-torch-ext> -- USE_CUDSS is set to 0. Compiling without cuDSS support +sage_attention-torch-ext> -- USE_CUFILE is set to 0. Compiling without cuFile support +sage_attention-torch-ext> CMake Warning at /nix/store/dzz5brlw0xzs9hp3v8fvvwcvkmsr3ls9-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Caffe2/public/utils.cmake:328 (message): +sage_attention-torch-ext> In the future we will require one to explicitly pass TORCH_CUDA_ARCH_LIST +sage_attention-torch-ext> to cmake instead of implicitly setting it as an env variable. This will +sage_attention-torch-ext> become a FATAL_ERROR in future version of pytorch. +sage_attention-torch-ext> Call Stack (most recent call first): +sage_attention-torch-ext> /nix/store/dzz5brlw0xzs9hp3v8fvvwcvkmsr3ls9-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Caffe2/public/cuda.cmake:337 (torch_cuda_get_nvcc_gencode_flag) +sage_attention-torch-ext> /nix/store/dzz5brlw0xzs9hp3v8fvvwcvkmsr3ls9-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Caffe2/Caffe2Config.cmake:86 (include) +sage_attention-torch-ext> /nix/store/dzz5brlw0xzs9hp3v8fvvwcvkmsr3ls9-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:68 (find_package) +sage_attention-torch-ext> CMakeLists.txt:30 (find_package) +sage_attention-torch-ext> +sage_attention-torch-ext> +sage_attention-torch-ext> -- Added CUDA NVCC flags for: -gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_89,code=sm_89;-gencode;arch=compute_90,code=sm_90 +sage_attention-torch-ext> -- Found Python: /nix/store/qal2apcjwlw2p2kk05dwqdgzh8ml687l-python3-3.13.6-env/bin/python (found version "3.13.6") found components: Interpreter +sage_attention-torch-ext> CMake Warning at /nix/store/6drs80sxjhskdki55g5k1dw0jzbd258w-python3.13-torch-2.8.0/lib/python3.13/site-packages/torch/share/cmake/Caffe2/public/cuda.cmake:146 (message): +sage_attention-torch-ext> Failed to compute shorthash for libnvrtc.so +sage_attention-torch-ext> Call Stack (most recent call first): +sage_attention-torch-ext> /nix/store/6drs80sxjhskdki55g5k1dw0jzbd258w-python3.13-torch-2.8.0/lib/python3.13/site-packages/torch/share/cmake/Caffe2/Caffe2Config.cmake:86 (include) +sage_attention-torch-ext> 
/nix/store/6drs80sxjhskdki55g5k1dw0jzbd258w-python3.13-torch-2.8.0/lib/python3.13/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:68 (find_package) +sage_attention-torch-ext> CMakeLists.txt:30 (find_package) +sage_attention-torch-ext> +sage_attention-torch-ext> +sage_attention-torch-ext> -- USE_CUDNN is set to 0. Compiling without cuDNN support +sage_attention-torch-ext> -- USE_CUSPARSELT is set to 0. Compiling without cuSPARSELt support +sage_attention-torch-ext> -- USE_CUDSS is set to 0. Compiling without cuDSS support +sage_attention-torch-ext> -- USE_CUFILE is set to 0. Compiling without cuFile support +sage_attention-torch-ext> -- Added CUDA NVCC flags for: -gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_89,code=sm_89;-gencode;arch=compute_90,code=sm_90;-gencode;arch=compute_100,code=sm_100;-gencode;arch=compute_101,code=sm_101;-gencode;arch=compute_120,code=sm_120 +sage_attention-torch-ext> CMake Warning at /nix/store/dzz5brlw0xzs9hp3v8fvvwcvkmsr3ls9-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:22 (message): +sage_attention-torch-ext> static library kineto_LIBRARY-NOTFOUND not found. +sage_attention-torch-ext> Call Stack (most recent call first): +sage_attention-torch-ext> /nix/store/dzz5brlw0xzs9hp3v8fvvwcvkmsr3ls9-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:125 (append_torchlib_if_found) +sage_attention-torch-ext> CMakeLists.txt:30 (find_package) +sage_attention-torch-ext> +sage_attention-torch-ext> +sage_attention-torch-ext> -- Found Torch: /nix/store/8sicfhvzq84gnxiwybyjgp80pcynamzn-python3.13-torch-2.7.1-lib/lib/libtorch.so +sage_attention-torch-ext> -- CUDA target architectures: 7.0;7.5;8.0;8.6;8.9;9.0 +sage_attention-torch-ext> -- CUDA supported target architectures: 7.0;7.5;8.0;8.6;8.9;9.0 +sage_attention-torch-ext> CMake Warning at /nix/store/6drs80sxjhskdki55g5k1dw0jzbd258w-python3.13-torch-2.8.0/lib/python3.13/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:22 (message): +sage_attention-torch-ext> static library kineto_LIBRARY-NOTFOUND not found. 
+sage_attention-torch-ext> Call Stack (most recent call first): +sage_attention-torch-ext> /nix/store/6drs80sxjhskdki55g5k1dw0jzbd258w-python3.13-torch-2.8.0/lib/python3.13/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:125 (append_torchlib_if_found) +sage_attention-torch-ext> CMakeLists.txt:30 (find_package) +sage_attention-torch-ext> +sage_attention-torch-ext> +sage_attention-torch-ext> -- Found Torch: /nix/store/mrq1wi2biib2p1mks17g8g5sc4fd492r-python3.13-torch-2.8.0-lib/lib/libtorch.so +sage_attention-torch-ext> -- CUDA target architectures: 7.0;7.5;8.0;8.6;8.9;9.0;10.0;10.1;12.0 +sage_attention-torch-ext> -- CUDA supported target architectures: 7.0;7.5;8.0;8.6;8.9;9.0;10.0;10.1;12.0 +sage_attention-torch-ext> -- PyTorch: Header version is: 12.8 +sage_attention-torch-ext> -- Found Python: /nix/store/wirj6dihrpcch7idfd7jy4l0hqfsgkk1-python3-3.13.6-env/bin/python (found version "3.13.6") found components: Interpreter +sage_attention-torch-ext> CMake Warning at /nix/store/4ww34a0xcdm3baaz7y2dnrr38r2yjwwx-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Caffe2/public/cuda.cmake:146 (message): +sage_attention-torch-ext> Failed to compute shorthash for libnvrtc.so +sage_attention-torch-ext> Call Stack (most recent call first): +sage_attention-torch-ext> /nix/store/4ww34a0xcdm3baaz7y2dnrr38r2yjwwx-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Caffe2/Caffe2Config.cmake:86 (include) +sage_attention-torch-ext> /nix/store/4ww34a0xcdm3baaz7y2dnrr38r2yjwwx-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:68 (find_package) +sage_attention-torch-ext> CMakeLists.txt:30 (find_package) +sage_attention-torch-ext> +sage_attention-torch-ext> +sage_attention-torch-ext> CMake Warning (dev) at /nix/store/0vnarm4qjnj16dr3zj9kwq6bn79c0icn-cmake-3.31.7/share/cmake-3.31/Modules/FindPackageHandleStandardArgs.cmake:441 (message): +sage_attention-torch-ext> The package name passed to `find_package_handle_standard_args` (nvtx3) does +sage_attention-torch-ext> not match the name of the calling package (Caffe2). This can lead to +sage_attention-torch-ext> problems in calling code that expects `find_package` result variables +sage_attention-torch-ext> (e.g., `_FOUND`) to follow a certain pattern. +sage_attention-torch-ext> Call Stack (most recent call first): +sage_attention-torch-ext> /nix/store/4ww34a0xcdm3baaz7y2dnrr38r2yjwwx-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Caffe2/public/cuda.cmake:184 (find_package_handle_standard_args) +sage_attention-torch-ext> /nix/store/4ww34a0xcdm3baaz7y2dnrr38r2yjwwx-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Caffe2/Caffe2Config.cmake:86 (include) +sage_attention-torch-ext> /nix/store/4ww34a0xcdm3baaz7y2dnrr38r2yjwwx-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:68 (find_package) +sage_attention-torch-ext> CMakeLists.txt:30 (find_package) +sage_attention-torch-ext> This warning is for project developers. Use -Wno-dev to suppress it. 
+sage_attention-torch-ext> +sage_attention-torch-ext> -- Could NOT find nvtx3 (missing: nvtx3_dir) +sage_attention-torch-ext> CMake Warning at /nix/store/4ww34a0xcdm3baaz7y2dnrr38r2yjwwx-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Caffe2/public/cuda.cmake:190 (message): +sage_attention-torch-ext> Cannot find NVTX3, find old NVTX instead +sage_attention-torch-ext> Call Stack (most recent call first): +sage_attention-torch-ext> /nix/store/4ww34a0xcdm3baaz7y2dnrr38r2yjwwx-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Caffe2/Caffe2Config.cmake:86 (include) +sage_attention-torch-ext> /nix/store/4ww34a0xcdm3baaz7y2dnrr38r2yjwwx-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:68 (find_package) +sage_attention-torch-ext> CMakeLists.txt:30 (find_package) +sage_attention-torch-ext> +sage_attention-torch-ext> +sage_attention-torch-ext> -- USE_CUDNN is set to 0. Compiling without cuDNN support +sage_attention-torch-ext> -- USE_CUSPARSELT is set to 0. Compiling without cuSPARSELt support +sage_attention-torch-ext> -- USE_CUDSS is set to 0. Compiling without cuDSS support +sage_attention-torch-ext> -- USE_CUFILE is set to 0. Compiling without cuFile support +sage_attention-torch-ext> CMake Warning at /nix/store/4ww34a0xcdm3baaz7y2dnrr38r2yjwwx-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Caffe2/public/utils.cmake:328 (message): +sage_attention-torch-ext> In the future we will require one to explicitly pass TORCH_CUDA_ARCH_LIST +sage_attention-torch-ext> to cmake instead of implicitly setting it as an env variable. This will +sage_attention-torch-ext> become a FATAL_ERROR in future version of pytorch. +sage_attention-torch-ext> Call Stack (most recent call first): +sage_attention-torch-ext> /nix/store/4ww34a0xcdm3baaz7y2dnrr38r2yjwwx-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Caffe2/public/cuda.cmake:337 (torch_cuda_get_nvcc_gencode_flag) +sage_attention-torch-ext> /nix/store/4ww34a0xcdm3baaz7y2dnrr38r2yjwwx-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Caffe2/Caffe2Config.cmake:86 (include) +sage_attention-torch-ext> /nix/store/4ww34a0xcdm3baaz7y2dnrr38r2yjwwx-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:68 (find_package) +sage_attention-torch-ext> CMakeLists.txt:30 (find_package) +sage_attention-torch-ext> +sage_attention-torch-ext> +sage_attention-torch-ext> -- Added CUDA NVCC flags for: -gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_89,code=sm_89;-gencode;arch=compute_90,code=sm_90;-gencode;arch=compute_100,code=sm_100;-gencode;arch=compute_101,code=sm_101;-gencode;arch=compute_120,code=sm_120 +sage_attention-torch-ext> CMake Warning at /nix/store/4ww34a0xcdm3baaz7y2dnrr38r2yjwwx-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:22 (message): +sage_attention-torch-ext> static library kineto_LIBRARY-NOTFOUND not found. 
+sage_attention-torch-ext> Call Stack (most recent call first): +sage_attention-torch-ext> /nix/store/4ww34a0xcdm3baaz7y2dnrr38r2yjwwx-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:125 (append_torchlib_if_found) +sage_attention-torch-ext> CMakeLists.txt:30 (find_package) +sage_attention-torch-ext> +sage_attention-torch-ext> +sage_attention-torch-ext> -- Found Torch: /nix/store/35sj4in2ddx47klyg96qmkpd4vh8py94-python3.13-torch-2.7.1-lib/lib/libtorch.so +sage_attention-torch-ext> -- CUDA target architectures: 7.0;7.5;8.0;8.6;8.9;9.0;10.0;10.1;12.0 +sage_attention-torch-ext> -- CUDA supported target architectures: 7.0;7.5;8.0;8.6;8.9;9.0;10.0;10.1;12.0 +sage_attention-torch-ext> -- Capabilities for kernel _fused: 9.0a;8.0;8.9 +sage_attention-torch-ext> -- Capabilities for kernel _qattn: 9.0a;8.0;8.9 +sage_attention-torch-ext> -- Capabilities for kernel _qattn_sm80: 8.0 +sage_attention-torch-ext> -- Capabilities for kernel _qattn_sm89: 8.9 +sage_attention-torch-ext> -- Capabilities for kernel _qattn_sm90: 9.0a +sage_attention-torch-ext> -- Configuring done (9.3s) +sage_attention-torch-ext> -- Generating done (0.0s) +sage_attention-torch-ext> CMake Warning: +sage_attention-torch-ext> Manually-specified variables were not used by the project: +sage_attention-torch-ext> +sage_attention-torch-ext> BUILD_TESTING +sage_attention-torch-ext> CMAKE_EXPORT_NO_PACKAGE_REGISTRY +sage_attention-torch-ext> CMAKE_INSTALL_BINDIR +sage_attention-torch-ext> CMAKE_INSTALL_DOCDIR +sage_attention-torch-ext> CMAKE_INSTALL_INCLUDEDIR +sage_attention-torch-ext> CMAKE_INSTALL_INFODIR +sage_attention-torch-ext> CMAKE_INSTALL_LIBDIR +sage_attention-torch-ext> CMAKE_INSTALL_LIBEXECDIR +sage_attention-torch-ext> CMAKE_INSTALL_LOCALEDIR +sage_attention-torch-ext> CMAKE_INSTALL_MANDIR +sage_attention-torch-ext> CMAKE_INSTALL_SBINDIR +sage_attention-torch-ext> CMAKE_POLICY_DEFAULT_CMP0025 +sage_attention-torch-ext> CUDAToolkit_INCLUDE_DIR +sage_attention-torch-ext> PROTOC_EXE +sage_attention-torch-ext> PYBIND11_PYTHONLIBS_OVERWRITE +sage_attention-torch-ext> PYTHON_EXECUTABLE +sage_attention-torch-ext> PYTHON_INCLUDE_DIR +sage_attention-torch-ext> PYTHON_SITE_PACKAGES +sage_attention-torch-ext> Protobuf_PROTOC_EXE +sage_attention-torch-ext> Protobuf_PROTOC_EXECUTABLE +sage_attention-torch-ext> +sage_attention-torch-ext> +sage_attention-torch-ext> -- Build files have been written to: /build/source/build +sage_attention-torch-ext> cmake: enabled parallel building +sage_attention-torch-ext> cmake: enabled parallel installing +sage_attention-torch-ext> Running phase: buildPhase +sage_attention-torch-ext> build flags: -j21 +sage_attention-torch-ext> -- Capabilities for kernel _qattn_sm89: 8.9 +sage_attention-torch-ext> -- Capabilities for kernel _qattn_sm90: 9.0a +sage_attention-torch-ext> -- Capabilities for kernel _qattn_sm80: 8.0 +sage_attention-torch-ext> -- Capabilities for kernel _fused: 9.0a;8.0;8.9 +sage_attention-torch-ext> -- Capabilities for kernel _qattn: 9.0a;8.0;8.9 +sage_attention-torch-ext> -- Configuring done (9.5s) +sage_attention-torch-ext> -- Generating done (0.0s) +sage_attention-torch-ext> CMake Warning: +sage_attention-torch-ext> Manually-specified variables were not used by the project: +sage_attention-torch-ext> +sage_attention-torch-ext> BUILD_TESTING +sage_attention-torch-ext> CMAKE_EXPORT_NO_PACKAGE_REGISTRY +sage_attention-torch-ext> CMAKE_INSTALL_BINDIR +sage_attention-torch-ext> CMAKE_INSTALL_DOCDIR +sage_attention-torch-ext> CMAKE_INSTALL_INCLUDEDIR 
+sage_attention-torch-ext> CMAKE_INSTALL_INFODIR +sage_attention-torch-ext> CMAKE_INSTALL_LIBDIR +sage_attention-torch-ext> CMAKE_INSTALL_LIBEXECDIR +sage_attention-torch-ext> CMAKE_INSTALL_LOCALEDIR +sage_attention-torch-ext> CMAKE_INSTALL_MANDIR +sage_attention-torch-ext> CMAKE_INSTALL_SBINDIR +sage_attention-torch-ext> CMAKE_POLICY_DEFAULT_CMP0025 +sage_attention-torch-ext> CUDAToolkit_INCLUDE_DIR +sage_attention-torch-ext> PROTOC_EXE +sage_attention-torch-ext> PYBIND11_PYTHONLIBS_OVERWRITE +sage_attention-torch-ext> PYTHON_EXECUTABLE +sage_attention-torch-ext> PYTHON_INCLUDE_DIR +sage_attention-torch-ext> PYTHON_SITE_PACKAGES +sage_attention-torch-ext> Protobuf_PROTOC_EXE +sage_attention-torch-ext> Protobuf_PROTOC_EXECUTABLE +sage_attention-torch-ext> +sage_attention-torch-ext> +sage_attention-torch-ext> -- Build files have been written to: /build/source/build +sage_attention-torch-ext> cmake: enabled parallel building +sage_attention-torch-ext> cmake: enabled parallel installing +sage_attention-torch-ext> Running phase: buildPhase +sage_attention-torch-ext> build flags: -j21 +sage_attention-torch-ext> -- Capabilities for kernel _fused: 9.0a;8.0;8.9 +sage_attention-torch-ext> -- Capabilities for kernel _qattn_sm89: 8.9 +sage_attention-torch-ext> -- Capabilities for kernel _qattn: 9.0a;8.0;8.9 +sage_attention-torch-ext> -- Capabilities for kernel _qattn_sm80: 8.0 +sage_attention-torch-ext> -- Capabilities for kernel _qattn_sm90: 9.0a +sage_attention-torch-ext> -- Configuring done (9.6s) +sage_attention-torch-ext> -- Generating done (0.0s) +sage_attention-torch-ext> CMake Warning: +sage_attention-torch-ext> Manually-specified variables were not used by the project: +sage_attention-torch-ext> +sage_attention-torch-ext> BUILD_TESTING +sage_attention-torch-ext> CMAKE_EXPORT_NO_PACKAGE_REGISTRY +sage_attention-torch-ext> CMAKE_INSTALL_BINDIR +sage_attention-torch-ext> CMAKE_INSTALL_DOCDIR +sage_attention-torch-ext> CMAKE_INSTALL_INCLUDEDIR +sage_attention-torch-ext> CMAKE_INSTALL_INFODIR +sage_attention-torch-ext> CMAKE_INSTALL_LIBDIR +sage_attention-torch-ext> CMAKE_INSTALL_LIBEXECDIR +sage_attention-torch-ext> CMAKE_INSTALL_LOCALEDIR +sage_attention-torch-ext> CMAKE_INSTALL_MANDIR +sage_attention-torch-ext> CMAKE_INSTALL_SBINDIR +sage_attention-torch-ext> CMAKE_POLICY_DEFAULT_CMP0025 +sage_attention-torch-ext> CUDAToolkit_INCLUDE_DIR +sage_attention-torch-ext> PROTOC_EXE +sage_attention-torch-ext> PYBIND11_PYTHONLIBS_OVERWRITE +sage_attention-torch-ext> PYTHON_EXECUTABLE +sage_attention-torch-ext> PYTHON_INCLUDE_DIR +sage_attention-torch-ext> PYTHON_SITE_PACKAGES +sage_attention-torch-ext> Protobuf_PROTOC_EXE +sage_attention-torch-ext> Protobuf_PROTOC_EXECUTABLE +sage_attention-torch-ext> +sage_attention-torch-ext> +sage_attention-torch-ext> -- Build files have been written to: /build/source/build +sage_attention-torch-ext> cmake: enabled parallel building +sage_attention-torch-ext> cmake: enabled parallel installing +sage_attention-torch-ext> Running phase: buildPhase +sage_attention-torch-ext> build flags: -j21 +sage_attention-torch-ext> -- Capabilities for kernel _qattn_sm90: 9.0a +sage_attention-torch-ext> -- Capabilities for kernel _qattn: 9.0a;8.0;8.9 +sage_attention-torch-ext> -- Capabilities for kernel _fused: 9.0a;8.0;8.9 +sage_attention-torch-ext> -- Capabilities for kernel _qattn_sm89: 8.9 +sage_attention-torch-ext> -- Capabilities for kernel _qattn_sm80: 8.0 +sage_attention-torch-ext> -- Configuring done (9.7s) +sage_attention-torch-ext> -- Generating done (0.0s) 
+sage_attention-torch-ext> CMake Warning: +sage_attention-torch-ext> Manually-specified variables were not used by the project: +sage_attention-torch-ext> +sage_attention-torch-ext> BUILD_TESTING +sage_attention-torch-ext> CMAKE_EXPORT_NO_PACKAGE_REGISTRY +sage_attention-torch-ext> CMAKE_INSTALL_BINDIR +sage_attention-torch-ext> CMAKE_INSTALL_DOCDIR +sage_attention-torch-ext> CMAKE_INSTALL_INCLUDEDIR +sage_attention-torch-ext> CMAKE_INSTALL_INFODIR +sage_attention-torch-ext> CMAKE_INSTALL_LIBDIR +sage_attention-torch-ext> CMAKE_INSTALL_LIBEXECDIR +sage_attention-torch-ext> CMAKE_INSTALL_LOCALEDIR +sage_attention-torch-ext> CMAKE_INSTALL_MANDIR +sage_attention-torch-ext> CMAKE_INSTALL_SBINDIR +sage_attention-torch-ext> CMAKE_POLICY_DEFAULT_CMP0025 +sage_attention-torch-ext> CUDAToolkit_INCLUDE_DIR +sage_attention-torch-ext> PROTOC_EXE +sage_attention-torch-ext> PYBIND11_PYTHONLIBS_OVERWRITE +sage_attention-torch-ext> PYTHON_EXECUTABLE +sage_attention-torch-ext> PYTHON_INCLUDE_DIR +sage_attention-torch-ext> PYTHON_SITE_PACKAGES +sage_attention-torch-ext> Protobuf_PROTOC_EXE +sage_attention-torch-ext> Protobuf_PROTOC_EXECUTABLE +sage_attention-torch-ext> +sage_attention-torch-ext> +sage_attention-torch-ext> -- Build files have been written to: /build/source/build +sage_attention-torch-ext> cmake: enabled parallel building +sage_attention-torch-ext> cmake: enabled parallel installing +sage_attention-torch-ext> Running phase: buildPhase +sage_attention-torch-ext> build flags: -j21 +sage_attention-torch-ext> -- Capabilities for kernel _qattn_sm89: 8.9 +sage_attention-torch-ext> -- Capabilities for kernel _fused: 9.0a;8.0;8.9 +sage_attention-torch-ext> -- Capabilities for kernel _qattn: 9.0a;8.0;8.9 +sage_attention-torch-ext> -- Capabilities for kernel _qattn_sm80: 8.0 +sage_attention-torch-ext> -- Capabilities for kernel _qattn_sm90: 9.0a +sage_attention-torch-ext> -- Configuring done (9.8s) +sage_attention-torch-ext> -- Generating done (0.0s) +sage_attention-torch-ext> CMake Warning: +sage_attention-torch-ext> Manually-specified variables were not used by the project: +sage_attention-torch-ext> +sage_attention-torch-ext> BUILD_TESTING +sage_attention-torch-ext> CMAKE_EXPORT_NO_PACKAGE_REGISTRY +sage_attention-torch-ext> CMAKE_INSTALL_BINDIR +sage_attention-torch-ext> CMAKE_INSTALL_DOCDIR +sage_attention-torch-ext> CMAKE_INSTALL_INCLUDEDIR +sage_attention-torch-ext> CMAKE_INSTALL_INFODIR +sage_attention-torch-ext> CMAKE_INSTALL_LIBDIR +sage_attention-torch-ext> CMAKE_INSTALL_LIBEXECDIR +sage_attention-torch-ext> CMAKE_INSTALL_LOCALEDIR +sage_attention-torch-ext> CMAKE_INSTALL_MANDIR +sage_attention-torch-ext> CMAKE_INSTALL_SBINDIR +sage_attention-torch-ext> CMAKE_POLICY_DEFAULT_CMP0025 +sage_attention-torch-ext> CUDAToolkit_INCLUDE_DIR +sage_attention-torch-ext> PROTOC_EXE +sage_attention-torch-ext> PYBIND11_PYTHONLIBS_OVERWRITE +sage_attention-torch-ext> PYTHON_EXECUTABLE +sage_attention-torch-ext> PYTHON_INCLUDE_DIR +sage_attention-torch-ext> PYTHON_SITE_PACKAGES +sage_attention-torch-ext> Protobuf_PROTOC_EXE +sage_attention-torch-ext> Protobuf_PROTOC_EXECUTABLE +sage_attention-torch-ext> +sage_attention-torch-ext> +sage_attention-torch-ext> -- Build files have been written to: /build/source/build +sage_attention-torch-ext> cmake: enabled parallel building +sage_attention-torch-ext> cmake: enabled parallel installing +sage_attention-torch-ext> Running phase: buildPhase +sage_attention-torch-ext> build flags: -j21 +sage_attention-torch-ext> [1/12] Building CXX object 
CMakeFiles/_sage_attention_44b112f_dirty.dir/torch-ext/torch_binding.cpp.o +sage_attention-torch-ext> [1/12] Building CXX object CMakeFiles/_sage_attention_44b112f_dirty.dir/torch-ext/torch_binding.cpp.o +sage_attention-torch-ext> [1/12] Building CXX object CMakeFiles/_sage_attention_44b112f_dirty.dir/torch-ext/torch_binding.cpp.o +sage_attention-torch-ext> [1/12] Building CXX object CMakeFiles/_sage_attention_44b112f_dirty.dir/torch-ext/torch_binding.cpp.o +sage_attention-torch-ext> [1/12] Building CXX object CMakeFiles/_sage_attention_44b112f_dirty.dir/torch-ext/torch_binding.cpp.o +sage_attention-torch-ext> [2/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_attn.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 441.985 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 435.474 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 542.186 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 537.633 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 556.850 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 558.532 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 525.842 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 527.640 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 44 bytes spill stores, 44 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 453.136 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 44 bytes spill stores, 44 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 455.551 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 44 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 560.086 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 44 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 566.771 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 451.886 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 452.820 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 563.157 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 552.506 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 223.644 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 223.313 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 228.182 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : 
Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 228.007 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 235 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 221.210 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 235 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 221.780 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 228.300 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 
bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 227.566 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 241 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 245.498 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 241 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 244.960 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 232 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 249.012 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 232 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile 
time = 249.548 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 247 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 244.845 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 247 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 244.634 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 240 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 251.408 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 240 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 250.002 ms +sage_attention-torch-ext> [2/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 
'threads', the last value of this option was used +sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(627): warning #177-D: variable "padded_kv_len" was declared but never referenced +sage_attention-torch-ext> int qo_len, kv_len, padded_kv_len, num_qo_heads, num_kv_heads; +sage_attention-torch-ext> ^ +sage_attention-torch-ext> +sage_attention-torch-ext> Remark: The warnings can be suppressed with "-diag-suppress <error-number>" +sage_attention-torch-ext> +sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(803): warning #177-D: variable "padded_kv_len" was declared but never referenced +sage_attention-torch-ext> int qo_len, kv_len, padded_kv_len, num_qo_heads, num_kv_heads; +sage_attention-torch-ext> ^ +sage_attention-torch-ext> +sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(170): warning #177-D: variable "sO" was declared but never referenced +sage_attention-torch-ext> half *sO = (half*)smem_; +sage_attention-torch-ext> ^ +sage_attention-torch-ext> +sage_attention-torch-ext> ptxas info : 10 bytes gmem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry 
function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads 
+sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads 
+sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads 
+sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> [3/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_attn.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 8 bytes stack frame, 8 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 8 bytes stack frame, 8 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 
bytes stack frame, 8 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 8 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 60 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 60 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] 
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 239 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 239 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 239 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 239 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 234 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 234 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 
0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 232 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 232 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> [3/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 56 bytes stack frame, 72 bytes spill stores, 56 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 56 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 586.615 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 56 bytes stack frame, 72 bytes spill stores, 56 
bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 56 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 588.685 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 80 bytes spill stores, 68 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 592.250 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 80 bytes spill stores, 68 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 593.767 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 574.520 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] 
+sage_attention-torch-ext> ptxas info : Compile time = 573.749 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 32 bytes spill stores, 28 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 586.361 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 32 bytes spill stores, 28 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 582.063 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 96 bytes stack frame, 176 bytes spill stores, 168 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 522.972 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 96 bytes stack frame, 176 bytes spill stores, 168 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 516.314 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 96 bytes stack frame, 92 bytes spill stores, 88 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 629.345 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 96 bytes stack frame, 92 bytes spill stores, 88 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 626.246 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 80 bytes spill stores, 68 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 502.294 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 80 bytes spill stores, 68 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 503.470 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 80 bytes stack frame, 60 bytes spill stores, 48 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 619.844 ms
[... per-kernel ptxas statistics and compile times for the remaining sm_89 qk_int_sv_f8_attn_kernel instantiations in this object elided ...]
+sage_attention-torch-ext> [4/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_attn.cu.o
+sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
+sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4]
[... per-kernel ptxas statistics for sm89_qk_int8_sv_f8_accum_f32_attn.cu elided ...]
+sage_attention-torch-ext> [5/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f16_attn_inst_buf.cu.o
+sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
+sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4]
[... per-kernel ptxas statistics for sm89_qk_int8_sv_f8_accum_f16_attn_inst_buf.cu elided ...]
+sage_attention-torch-ext> [2/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu.o
+sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
+sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(627): warning #177-D: variable "padded_kv_len" was declared but never referenced
+sage_attention-torch-ext> int qo_len, kv_len, padded_kv_len, num_qo_heads, num_kv_heads;
+sage_attention-torch-ext> ^
+sage_attention-torch-ext>
+sage_attention-torch-ext> Remark: The warnings can be suppressed with "-diag-suppress <error-number>"
+sage_attention-torch-ext>
+sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(803): warning #177-D: variable "padded_kv_len" was declared but never referenced
+sage_attention-torch-ext> int qo_len, kv_len, padded_kv_len, num_qo_heads, num_kv_heads;
+sage_attention-torch-ext> ^
+sage_attention-torch-ext>
+sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(170): warning #177-D: variable "sO" was declared but never referenced
+sage_attention-torch-ext> half *sO = (half*)smem_;
+sage_attention-torch-ext> ^
+sage_attention-torch-ext>
+sage_attention-torch-ext> ptxas info : 10 bytes gmem
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem
[... additional sm_90a qk_int8_sv_f8_attn_kernel statistics elided ...]
+sage_attention-torch-ext> ptxas info : Compiling entry function
'_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill 
loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads 
+sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill 
loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> [3/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 24 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 24 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 88 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 88 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 24 bytes stack frame, 36 bytes spill stores, 20 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 24 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 24 bytes stack frame, 36 bytes spill stores, 20 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 24 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 16 bytes spill stores, 12 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 16 bytes spill stores, 12 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes 
cmem[2]

[ptxas resource summary for the remaining __half/__nv_bfloat16 instantiations of qk_int_sv_f8_attn_kernel on 'sm_89': every variant reports 255 registers, 1 barrier, 488 bytes cmem[0] and 8 bytes cmem[2]; stack frames range from 0 to 80 bytes, with at most 52 bytes of spill stores and 40 bytes of spill loads.]

+sage_attention-torch-ext> [4/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/fused/fused.cu.o
+sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used

[ptxas resource summary for the fused kernels (MeanScaleKernel, TransposePadPermuteKernel, SubMeanKernel, QuantInt8Kernel), compiled for 'sm_90a', 'sm_80' and 'sm_89': every instantiation reports a 0-byte stack frame with no spills; register usage stays between 16 and 40 (MeanScaleKernel 32-40, TransposePadPermuteKernel 19-25, SubMeanKernel 16-20, QuantInt8Kernel 25-38), with 132-392 bytes of static smem for the quantization and mean/scale kernels and 16-32 KiB of smem for TransposePadPermuteKernel. The 'sm_89' QuantInt8Kernel entries continue below.]

+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info :
Function properties for _Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.230 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.119 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.241 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.773 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.295 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.169 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill 
stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 38 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 12.442 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 8.177 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 8.164 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 8.109 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 40 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 10.826 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 8.233 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] 
+sage_attention-torch-ext> ptxas info : Compile time = 8.210 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 8.194 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 33 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 11.782 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 8.427 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.885 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 8.938 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 33 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 11.257 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 8.465 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 8.747 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.217 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 33 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 10.086 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.344 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.385 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function 
properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.296 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 33 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 10.133 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.429 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.385 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.303 ms +sage_attention-torch-ext> [2/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_attn.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 8 bytes stack 
frame, 8 bytes spill stores, 8 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 631.269 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 678.510 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 889.763 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 842.414 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 
bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 848.507 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 860.566 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 852.066 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 857.813 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 44 bytes spill stores, 44 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 798.712 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 44 bytes spill stores, 44 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 814.182 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 44 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 766.806 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 44 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 563.761 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 460.277 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 462.993 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 565.805 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 557.043 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 226.177 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 
'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 223.149 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 228.610 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 229.459 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 235 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 224.016 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 235 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 223.754 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 231.030 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 231.032 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 241 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 248.149 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes 
spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 241 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 248.080 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 232 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 253.757 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 232 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 254.788 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 247 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 257.808 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 247 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 248.484 ms +sage_attention-torch-ext> ptxas info : 
Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 240 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 256.288 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 240 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 255.649 ms +sage_attention-torch-ext> [4/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_attn.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 8 bytes stack frame, 8 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 8 bytes stack frame, 8 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes 
cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> [ptxas output condensed: the remaining qk_int_sv_f8_attn_kernel template instantiations for this object (__half and __nv_bfloat16 variants across the QuantGranularity and MaskMode combinations) all compile for 'sm_89' with 230-255 registers and 1 barrier, 0-80 bytes stack frame, at most 60 bytes of spill stores, 488 bytes cmem[0] and 8 bytes cmem[2].]
+sage_attention-torch-ext> [3/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_attn.cu.o
+sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
+sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4]
+sage_attention-torch-ext> [ptxas output condensed: every qk_int_sv_f8_attn_kernel instantiation in this translation unit compiles for 'sm_89' with 230-255 registers and 1 barrier, up to 80 bytes stack frame and 56 bytes of spill stores, 488 bytes cmem[0] and 8 bytes cmem[2]; per-kernel ptxas compile times range from roughly 240 ms to 800 ms.]
+sage_attention-torch-ext> [4/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_fuse_v_mean_attn.cu.o
+sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
+sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4]
+sage_attention-torch-ext> [ptxas output condensed: the instantiations reported so far for this translation unit compile for 'sm_89' with 230-255 registers and 1 barrier, up to 80 bytes stack frame and 52 bytes of register spills, 488 bytes cmem[0] and 8 bytes cmem[2]; per-kernel ptxas compile times range from roughly 240 ms to 990 ms.]
+sage_attention-torch-ext> ptxas info : Compile time = 276.028 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 278.538 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 272.368 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 270.965 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 232 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 278.318 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 232 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 276.783 ms +sage_attention-torch-ext> [5/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_attn.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes 
stack frame, 16 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 235 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 235 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 241 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 
0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 241 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 232 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 232 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 247 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 247 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 240 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 240 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> [5/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_attn_inst_buf.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 64 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 950.424 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 64 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 938.339 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' 
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 8 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 942.385 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 8 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 951.773 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 84 bytes spill stores, 56 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 941.110 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 84 bytes spill stores, 56 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 948.921 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 16 bytes spill stores, 12 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 953.087 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 16 bytes spill stores, 12 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 937.847 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 64 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 796.084 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 64 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 790.276 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 1005.200 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 1014.114 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 36 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 792.773 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 36 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 810.731 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 1016.541 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 1015.399 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 512.495 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 519.312 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes 
stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 459.927 ms
[ptxas per-instantiation statistics for the remaining qk_int_sv_f8_attn_kernel variants compiled for 'sm_89'; the "[5/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/fused/fused.cu.o" step, which emits an nvcc warning about an incompatible redefinition of the 'threads' option; and the fused MeanScaleKernel, TransposePadPermuteKernel, SubMeanKernel, and QuantInt8Kernel instantiations compiled for 'sm_90a', 'sm_80', and 'sm_89'. Across these kernels ptxas reports at most 255 registers, at most 16 bytes of spill stores/loads, up to 32768 bytes of shared memory, and per-kernel compile times between roughly 3 ms and 560 ms.]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem,
432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 8.424 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 8.406 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 8.362 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 33 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 9.822 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.583 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.380 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.357 ms +sage_attention-torch-ext> ptxas info : Compiling entry 
function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 33 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 9.895 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.353 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.400 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.508 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 33 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 10.780 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.433 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function 
properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.559 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.439 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 33 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 10.336 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.564 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.563 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.546 ms +sage_attention-torch-ext> [6/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(627): warning 
#177-D: variable "padded_kv_len" was declared but never referenced +sage_attention-torch-ext> int qo_len, kv_len, padded_kv_len, num_qo_heads, num_kv_heads; +sage_attention-torch-ext> ^ +sage_attention-torch-ext> +sage_attention-torch-ext> Remark: The warnings can be suppressed with "-diag-suppress <warning-number>" +sage_attention-torch-ext> +sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(803): warning #177-D: variable "padded_kv_len" was declared but never referenced +sage_attention-torch-ext> int qo_len, kv_len, padded_kv_len, num_qo_heads, num_kv_heads; +sage_attention-torch-ext> ^ +sage_attention-torch-ext> +sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(170): warning #177-D: variable "sO" was declared but never referenced +sage_attention-torch-ext> half *sO = (half*)smem_; +sage_attention-torch-ext> ^ +sage_attention-torch-ext> +sage_attention-torch-ext> ptxas info : 11 bytes gmem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 254.401 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 258.197 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 292.728 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 288.496 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 259.609 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 257.610 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 310.313 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 261.558 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 305.399 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 290.890 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 321.909 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 268.549 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 305.193 ms 
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 323.257 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 306.249 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 328.274 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 198.513 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> 
ptxas info : Compile time = 195.701 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 203.394 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 206.831 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 188.040 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 177.604 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 
barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 134.328 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 110.982 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 112.759 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 112.951 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 117.419 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads 
+sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 119.115 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 113.767 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 113.382 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 117.939 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 118.428 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf 
+sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 161.385 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 162.059 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 163.835 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 165.521 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 159.573 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 159.492 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 163.714 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 164.815 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 171.328 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 166.435 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' 
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 172.348 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 172.724 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 168.493 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 166.818 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 173.286 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 173.009 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 105.165 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 107.053 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 112.071 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 110.342 
ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 106.804 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 106.230 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 111.573 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 110.108 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem 
+sage_attention-torch-ext> ptxas info : Compile time = 113.093 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 113.517 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 117.870 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 118.980 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 114.139 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 
registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 113.256 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 116.641 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 115.081 ms +sage_attention-torch-ext> [6/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_fuse_v_mean_attn.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] 
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 24 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 24 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 20 bytes spill stores, 20 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 20 bytes spill stores, 20 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes 
cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 60 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 60 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 245 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 245 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 233 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 233 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 
bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 234 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 234 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 232 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 232 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> [7/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 28 bytes spill stores, 12 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 964.900 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 28 bytes spill stores, 12 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 958.181 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 76 bytes spill stores, 44 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 958.371 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 76 bytes spill stores, 44 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 958.394 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 28 bytes spill stores, 12 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 939.503 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 28 bytes spill stores, 12 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 701.023 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 16 bytes spill stores, 12 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 573.680 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 16 bytes spill stores, 12 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 801.568 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 813.002 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 808.011 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 1016.111 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 977.034 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 754.749 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 731.276 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 923.620 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 940.844 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 459.245 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf 
+sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 457.206 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 451.179 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 450.787 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 447.327 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile 
time = 431.471 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 506.880 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 527.163 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 553.846 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 551.678 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 568.979 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 495.100 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 483.901 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 477.401 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas 
info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 496.705 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 515.510 ms +sage_attention-torch-ext> [6/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 28 bytes spill stores, 12 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 966.091 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 28 bytes spill stores, 12 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 961.337 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 76 bytes spill stores, 44 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 972.006 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 76 bytes spill stores, 44 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 974.650 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 28 bytes spill stores, 12 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 957.506 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 28 bytes spill stores, 12 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 960.363 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 16 bytes spill stores, 12 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 971.507 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 16 bytes spill stores, 12 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 970.596 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 807.611 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 802.233 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 1033.905 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 1024.056 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 810.552 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 796.703 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 1004.351 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 1012.638 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 510.266 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 512.618 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' 
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 521.439 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 524.050 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 512.244 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 508.562 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 529.261 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 519.651 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 547.796 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 543.208 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 
0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 560.784 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 556.368 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 546.293 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 550.874 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 558.939 ms 
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 549.771 ms +sage_attention-torch-ext> [7/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 264 bytes stack frame, 276 bytes spill stores, 280 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 264 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 264 bytes stack frame, 276 bytes spill stores, 280 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 264 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 288 bytes stack frame, 268 bytes spill stores, 268 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 288 bytes cumulative stack size, 488 bytes 
cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 288 bytes stack frame, 268 bytes spill stores, 268 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 288 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 256 bytes stack frame, 272 bytes spill stores, 276 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 256 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 256 bytes stack frame, 272 bytes spill stores, 276 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 256 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 272 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 272 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 272 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 272 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 224 bytes stack frame, 252 bytes spill stores, 248 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 224 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 224 bytes stack frame, 252 bytes spill stores, 248 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 224 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 256 bytes stack frame, 216 bytes spill stores, 232 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 256 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 256 bytes stack frame, 216 bytes spill stores, 232 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 256 bytes cumulative stack size, 488 bytes 
cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 216 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 216 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 216 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 216 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 256 bytes stack frame, 212 bytes spill stores, 224 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 256 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 256 bytes stack frame, 212 bytes spill stores, 224 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 256 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 231 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 231 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 238 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 238 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 235 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 
0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 235 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 242 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 242 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 233 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 233 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 242 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 242 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> [8/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_attn_inst_buf.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 64 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 551.911 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 64 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 541.946 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' 
+sage_attention-torch-ext> [ptxas -v resource reports elided: per-instantiation statistics for the sm_89 qk_int_sv_f8_attn_kernel template variants built in steps [6/12] (sm89_qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf.cu) and [8/12] (sm89_qk_int8_sv_f8_accum_f32_attn_inst_buf.cu), plus the preceding object. Every variant compiles for sm_89 with 231-255 registers, 0-288 bytes of stack frame, and at most ~280 bytes of spill stores/loads; reported per-kernel compile times range from roughly 0.26 s to 0.59 s. nvcc also emits "incompatible redefinition for option 'threads', the last value of this option was used" for each translation unit.]
info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> [7/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(627): warning #177-D: variable "padded_kv_len" was declared but never referenced +sage_attention-torch-ext> int qo_len, kv_len, padded_kv_len, num_qo_heads, num_kv_heads; +sage_attention-torch-ext> ^ +sage_attention-torch-ext> +sage_attention-torch-ext> Remark: The warnings can be suppressed with "-diag-suppress " +sage_attention-torch-ext> +sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(803): warning #177-D: variable "padded_kv_len" was declared but never referenced +sage_attention-torch-ext> int qo_len, kv_len, padded_kv_len, num_qo_heads, num_kv_heads; +sage_attention-torch-ext> ^ +sage_attention-torch-ext> +sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(170): warning #177-D: variable "sO" was declared but never referenced +sage_attention-torch-ext> half *sO = (half*)smem_; +sage_attention-torch-ext> ^ +sage_attention-torch-ext> +sage_attention-torch-ext> ptxas info : 11 bytes gmem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 250.137 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' 
+sage_attention-torch-ext> [... ptxas resource-usage reports for the remaining sm_90a 'qk_int8_sv_f8_attn_kernel' instantiations omitted: each uses 128-168 registers, 1 barrier, 128 bytes smem, a 0- or 16-byte stack frame and no spills; per-kernel compile times range from roughly 100 ms to 325 ms ...]
+sage_attention-torch-ext> [9/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f16_attn_inst_buf.cu.o
+sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
+sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4]
+sage_attention-torch-ext> [... ptxas resource-usage reports for the sm_89 'qk_int_sv_f8_attn_kernel' instantiations in this translation unit omitted: each uses 255 registers, 1 barrier, a 256-288 byte stack frame with roughly 230-300 bytes of spill stores/loads, 488 bytes cmem[0] and 8 bytes cmem[2] ...]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 272 bytes stack frame, 292 bytes spill stores, 292 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 272 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 256 bytes stack frame, 212 bytes spill stores, 224 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 256 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 256 bytes stack frame, 212 bytes spill stores, 224 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 256 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry 
function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 240 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 240 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 228 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 228 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf 
+sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 240 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 240 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 235 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 235 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 242 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' 
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 242 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 231 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 231 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 240 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 240 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] 
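Editor's note on reading the resource lines above: the per-kernel figures (registers, spill stores/loads, cumulative stack size, smem, cmem) are ptxas verbose output, presumably enabled in this build through nvcc's `--ptxas-options=-v` (a.k.a. `-Xptxas -v`) or `--resource-usage` flag. If it helps to cross-check those numbers on an actual device, the sketch below is a hypothetical standalone probe (not part of this diff; the file name `probe.cu` and the kernel `dummy_kernel` are placeholders) that queries the corresponding attributes at runtime with `cudaFuncGetAttributes`: `numRegs`, `localSizeBytes`, and `sharedSizeBytes` line up with the "Used N registers", stack-frame/spill bytes, and static "bytes smem" values reported by ptxas.

// Hypothetical probe, not part of this build. Compile for the target arch, e.g.:
//   nvcc -arch=sm_90a --ptxas-options=-v probe.cu -o probe
#include <cstdio>
#include <cuda_runtime.h>

// Stand-in kernel; in practice you would take the address of one of the
// template instantiations listed in the log above instead.
__global__ void dummy_kernel(float* out) {
    out[threadIdx.x] = static_cast<float>(threadIdx.x);
}

int main() {
    cudaFuncAttributes attr{};
    if (cudaFuncGetAttributes(&attr, dummy_kernel) != cudaSuccess) {
        std::fprintf(stderr, "cudaFuncGetAttributes failed\n");
        return 1;
    }
    // numRegs         -> "Used N registers"
    // localSizeBytes  -> per-thread local memory (stack frame / spill traffic)
    // sharedSizeBytes -> static "bytes smem"
    // constSizeBytes  -> "bytes cmem"
    std::printf("regs=%d local=%zu smem=%zu const=%zu\n",
                attr.numRegs, attr.localSizeBytes,
                attr.sharedSizeBytes, attr.constSizeBytes);
    return 0;
}

The runtime query reports the resources of the cubin actually loaded for the current GPU, so it is also a quick way to confirm that the intended sm_80 / sm_89 / sm_90a variant was selected.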
+sage_attention-torch-ext> [7/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/fused/fused.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : 10 bytes gmem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb1E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb1E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 392 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb1E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb1E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 392 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb0E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb0E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 260 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb0E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb0E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 260 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj128ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj128ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 21 registers, used 1 barriers, 32768 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj64ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj64ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 19 registers, used 1 barriers, 16384 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj128ELj64ELb1E6__halfEvPT2_S2_jjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj128ELj64ELb1E6__halfEvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 21 registers, used 1 barriers, 
32768 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 19 registers, used 1 barriers, 16384 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 20 registers, used 0 barriers +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 20 registers, used 0 barriers +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 38 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill 
loads +sage_attention-torch-ext> ptxas info : Used 29 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 29 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 38 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 29 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 29 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill 
loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb1E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb1E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 392 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb1E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb1E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 392 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb0E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb0E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 260 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb0E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb0E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 260 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj128ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj128ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 32768 bytes smem, 396 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj64ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj64ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 20 registers, used 1 barriers, 16384 bytes smem, 396 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj128ELj64ELb1E6__halfEvPT2_S2_jjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj128ELj64ELb1E6__halfEvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 32768 bytes smem, 396 bytes cmem[0] 
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 20 registers, used 1 barriers, 16384 bytes smem, 396 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 20 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 20 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> 
ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 38 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj
+sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0]
[Build log, condensed: per-kernel ptxas resource reports for the QuantInt8Kernel, MeanScaleKernel, TransposePadPermuteKernel, SubMeanKernel and qk_int_sv_f8_attn_kernel template instantiations compiled for sm_80, sm_89 and sm_90a, covering object steps [8/12] sm89_qk_int8_sv_f8_accum_f16_attn_inst_buf.cu.o and [10/12] fused/fused.cu.o. The quantization, transpose and mean-subtraction kernels use 16-40 registers, at most 32768 bytes of shared memory and no spills; the sm_89 qk_int_sv_f8 attention kernels use 249-255 registers, spill up to 176 bytes and take roughly 0.4-1.0 s of ptxas compile time per instantiation. nvcc also emits the warning "incompatible redefinition for option 'threads', the last value of this option was used" for each of these objects.]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a'
+sage_attention-torch-ext> ptxas info : Function properties
for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 29 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 
0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : 
Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb1E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb1E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 392 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb1E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb1E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 392 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb0E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb0E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 260 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb0E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb0E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 260 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj128ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj128ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 32768 bytes smem, 396 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj64ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj' for 'sm_80' 
+sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj64ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 20 registers, used 1 barriers, 16384 bytes smem, 396 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj128ELj64ELb1E6__halfEvPT2_S2_jjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj128ELj64ELb1E6__halfEvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 32768 bytes smem, 396 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 20 registers, used 1 barriers, 16384 bytes smem, 396 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 20 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 20 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes 
spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info 
: Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 38 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 38 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 25 registers, 
used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 
bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem, 432 bytes 
cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb1E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb1E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 39 registers, used 1 barriers, 392 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb1E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb1E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 39 registers, used 1 barriers, 392 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb0E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb0E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 40 registers, used 1 barriers, 260 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb0E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb0E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 40 registers, used 1 barriers, 260 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25TransposePadPermuteKernelILj128ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj128ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 21 registers, used 1 barriers, 32768 bytes smem, 396 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj64ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj64ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 19 registers, used 1 barriers, 16384 bytes smem, 396 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj128ELj64ELb1E6__halfEvPT2_S2_jjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj128ELj64ELb1E6__halfEvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 21 registers, used 1 barriers, 32768 bytes smem, 396 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 19 registers, used 1 barriers, 16384 bytes smem, 396 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 18 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 18 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 38 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 40 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 33 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj
+sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 33 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0]
[ptxas, sm_89: the remaining QuantInt8Kernel instantiations (__half / __nv_bfloat16, 64/128 block-size variants) use 26-33 registers, 1 barrier, 132 bytes smem, and 432 bytes cmem[0], with no stack frame and no spills.]
+sage_attention-torch-ext> [9/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_attn.cu.o
+sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
+sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4]
[ptxas, sm_89: this unit's qk_int_sv_f8_attn_kernel instantiations (__half / __nv_bfloat16, QuantGranularity 2/3, MaskMode 0/1) use 230-255 registers, 1 barrier, and 0-80 bytes of stack frame with at most 56/48 bytes of spill stores/loads, plus 488 bytes cmem[0] and 8 bytes cmem[2]; per-kernel compile times range from roughly 0.23 s to 0.99 s.]
+sage_attention-torch-ext> [9/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f16_attn_inst_buf.cu.o
+sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
+sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4]
[ptxas, sm_89: this unit's qk_int_sv_f8_attn_kernel instantiations use 249-255 registers, 1 barrier, and 0-112 bytes of stack frame with at most 176/164 bytes of spill stores/loads, plus 488 bytes cmem[0] and 8 bytes cmem[2]; per-kernel compile times range from roughly 0.39 s to 1.10 s.]
+sage_attention-torch-ext> [10/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf.cu.o
+sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
+sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4]
[ptxas, sm_89: the qk_int_sv_f8_attn_kernel instantiations reported so far for this unit use 255 registers, 1 barrier, and 16-96 bytes of stack frame with at most 176/168 bytes of spill stores/loads, plus 488 bytes cmem[0] and 8 bytes cmem[2]; per-kernel compile times range from roughly 0.59 s to 1.13 s.]
+sage_attention-torch-ext> ptxas info : Compiling entry function
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 60 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 1094.002 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 251 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 401.149 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 251 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 391.095 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 442.479 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info 
: Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 440.891 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 428.954 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 437.536 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 459.124 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 
bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 442.354 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 462.155 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 471.281 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 253 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 489.585 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 253 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile 
time = 469.265 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 451.902 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 457.616 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 473.635 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 469.484 ms +sage_attention-torch-ext> [8/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf.cu.o +sage_attention-torch-ext> nvcc warning : 
incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 24 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 24 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 88 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 88 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 
'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 24 bytes stack frame, 36 bytes spill stores, 20 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 24 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 24 bytes stack frame, 36 bytes spill stores, 20 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 24 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 16 bytes spill stores, 12 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 16 bytes spill stores, 12 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 
1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 
bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info 
: Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> [10/12] Building CUDA object 
CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_fuse_v_mean_attn.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 44 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 984.592 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 44 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 990.417 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 24 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 1003.978 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 24 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 
registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 991.593 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 973.002 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 963.093 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 974.741 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 970.046 ms 
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 829.686 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 778.324 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 48 bytes spill stores, 52 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 814.682 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 48 bytes spill stores, 52 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 994.113 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 826.298 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 831.656 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 996.818 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 1000.920 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 245 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 406.501 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 245 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 410.054 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 419.738 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 419.590 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas 
info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 233 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 381.081 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 233 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 371.843 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 422.365 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 424.444 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf 
+sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 234 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 460.547 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 234 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 454.437 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 464.828 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 448.633 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile 
time = 452.369 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 477.124 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 232 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 455.888 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 232 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 448.456 ms +sage_attention-torch-ext> [9/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_attn_inst_buf.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 68 bytes spill stores, 52 bytes spill loads +sage_attention-torch-ext> ptxas info 
: Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 68 bytes spill stores, 52 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 8 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 8 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 84 bytes spill stores, 56 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function 
properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 84 bytes spill stores, 56 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 20 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 20 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 64 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 64 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 
bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 36 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 36 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> [10/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_fuse_v_mean_attn.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 24 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 24 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 20 bytes spill stores, 20 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 20 bytes spill stores, 20 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes 
cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 60 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 60 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 245 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 245 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 233 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 233 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 
bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 234 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 234 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info 
: Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 232 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 232 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> [2/12] Building CUDA object 
CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(627): warning #177-D: variable "padded_kv_len" was declared but never referenced +sage_attention-torch-ext> int qo_len, kv_len, padded_kv_len, num_qo_heads, num_kv_heads; +sage_attention-torch-ext> ^ +sage_attention-torch-ext> +sage_attention-torch-ext> Remark: The warnings can be suppressed with "-diag-suppress " +sage_attention-torch-ext> +sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(803): warning #177-D: variable "padded_kv_len" was declared but never referenced +sage_attention-torch-ext> int qo_len, kv_len, padded_kv_len, num_qo_heads, num_kv_heads; +sage_attention-torch-ext> ^ +sage_attention-torch-ext> +sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(170): warning #177-D: variable "sO" was declared but never referenced +sage_attention-torch-ext> half *sO = (half*)smem_; +sage_attention-torch-ext> ^ +sage_attention-torch-ext> +sage_attention-torch-ext> ptxas info : 28 bytes gmem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 164.799 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 162.299 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 167.071 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 167.305 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 163.529 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 162.413 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 167.845 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 
166.829 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 170.956 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 171.643 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 171.202 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 169.275 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes 
smem +sage_attention-torch-ext> ptxas info : Compile time = 162.832 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 163.102 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 167.108 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 167.067 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 102.579 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 
128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 102.599 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 106.308 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 106.308 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 101.930 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 101.802 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill 
loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 105.487 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 106.022 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 107.688 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 108.189 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 112.518 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf 
+sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 112.481 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 108.660 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 108.254 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 112.380 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 111.712 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 151.116 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 150.731 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 155.635 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 156.071 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 150.923 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' 
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 151.125 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 155.642 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 155.914 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 158.104 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 159.048 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 163.044 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 163.951 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 159.584 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 160.201 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info 
: Compile time = 160.848 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 160.423 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 100.711 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 100.134 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 102.477 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes 
cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 102.970 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 99.158 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 100.406 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 103.529 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 102.484 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads 
+sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 105.449 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 105.326 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 110.221 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 108.746 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 105.144 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes 
spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 105.226 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 109.365 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 109.624 ms +sage_attention-torch-ext> [3/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_attn.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : 28 bytes gmem, 224 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 428.455 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 
488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info    : Compile time = 429.635 ms
[ptxas output for the remaining sm_89 instantiations of qk_int_sv_f8_attn_kernel in this translation unit condensed: each used 229-255 registers and a 0-64 byte stack frame (at most 36 bytes of spill stores), with per-kernel compile times of roughly 0.22-0.55 s]
+sage_attention-torch-ext> [4/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_attn.cu.o
+sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
+sage_attention-torch-ext> ptxas info    : 28 bytes gmem, 224 bytes cmem[4]
[ptxas output for the sm_89 instantiations of qk_int_sv_f8_attn_kernel in this translation unit condensed: each used 231-255 registers and a 0-80 byte stack frame (at most 44 bytes of spill stores), with per-kernel compile times of roughly 0.23-0.57 s]
+sage_attention-torch-ext> [5/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f16_attn_inst_buf.cu.o
+sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
+sage_attention-torch-ext> ptxas info    : 28 bytes gmem, 224 bytes cmem[4]
[ptxas output for the sm_89 instantiations of qk_int_sv_f8_attn_kernel compiled so far in this translation unit condensed: each used 249-255 registers and a 0-112 byte stack frame (at most 176 bytes of spill stores), with per-kernel compile times of roughly 0.23-0.59 s]
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 253.171 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 249.628 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 256.891 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 256.335 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 253 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile 
time = 248.943 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 253 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 250.335 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 257.577 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 255.777 ms +sage_attention-torch-ext> [6/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_attn_inst_buf.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : 28 bytes gmem, 224 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 64 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas 
info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 548.097 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 64 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 531.494 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 8 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 541.595 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 8 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 541.311 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 84 bytes spill stores, 56 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 
543.318 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 84 bytes spill stores, 56 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 540.357 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 16 bytes spill stores, 12 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 540.372 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 16 bytes spill stores, 12 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 540.675 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 64 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 465.859 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 64 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 457.031 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 573.905 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 573.667 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 36 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 460.213 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 36 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 460.097 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 568.269 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 568.915 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 284.879 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 
'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 283.571 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 254.400 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 250.976 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 287.723 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 285.951 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 251.519 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 250.311 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 309.154 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf 
+sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 307.970 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 319.757 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 317.137 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 308.197 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile 
time = 307.759 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 312.691 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 312.052 ms +sage_attention-torch-ext> [7/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : 28 bytes gmem, 224 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 56 bytes stack frame, 72 bytes spill stores, 56 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 56 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 568.453 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 56 bytes stack frame, 72 bytes spill stores, 56 
bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 56 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 568.150 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 80 bytes spill stores, 68 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 574.691 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 80 bytes spill stores, 68 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 577.777 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 555.916 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] 
+sage_attention-torch-ext> ptxas info : Compile time = 554.288 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 32 bytes spill stores, 28 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 564.822 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 32 bytes spill stores, 28 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 565.896 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 96 bytes stack frame, 176 bytes spill stores, 168 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 519.489 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 96 bytes stack frame, 176 bytes spill stores, 168 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 509.876 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 96 bytes stack frame, 92 bytes spill stores, 88 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 608.774 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 96 bytes stack frame, 92 bytes spill stores, 88 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 610.988 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 80 bytes spill stores, 68 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 496.281 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 80 bytes spill stores, 68 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 497.232 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 80 bytes stack frame, 60 bytes spill stores, 48 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 598.825 ms
[... ptxas output for the remaining sm_89 qk_int_sv_f8_attn_kernel instantiations of this object elided: each entry function reports 251-255 registers, 0-80 bytes stack frame, at most 60/48 bytes of spill stores/loads, 488 bytes cmem[0], 8 bytes cmem[2], and a compile time of roughly 236-600 ms ...]
+sage_attention-torch-ext> [8/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf.cu.o
+sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
+sage_attention-torch-ext> ptxas info : 28 bytes gmem, 224 bytes cmem[4]
[... ptxas output for the sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf qk_int_sv_f8_attn_kernel instantiations elided: each entry function reports 255 registers, 0-80 bytes stack frame, at most 76/44 bytes of spill stores/loads, 488 bytes cmem[0], 8 bytes cmem[2], and a compile time of roughly 300-600 ms ...]
+sage_attention-torch-ext> [11/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/qk_int_sv_f16_cuda_sm80.cu.o
+sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
+sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4]
[... ptxas output for the sm_80 qk_int_sv_f16_attn_kernel instantiations elided: each entry function reports 176-255 registers, 0-32 bytes stack frame, no register spills, 480 bytes cmem[0], and a compile time of roughly 197-597 ms ...]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj16ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80'
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj16ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 184 registers, used 1 barriers,
16 bytes cumulative stack size, 480 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 222.448 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj16ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj16ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 184 registers, used 1 barriers, 16 bytes cumulative stack size, 480 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 216.447 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj16ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj16ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 180 registers, used 1 barriers, 480 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 222.938 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj16ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj16ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 180 registers, used 1 barriers, 480 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 220.112 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj16ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj16ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 177 registers, used 1 barriers, 16 bytes cumulative stack size, 480 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 225.064 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj16ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj16ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 177 registers, used 1 barriers, 16 bytes cumulative stack size, 480 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 225.100 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj16ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj16ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 177 registers, used 1 barriers, 480 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 221.324 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj16ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj16ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 177 registers, used 1 barriers, 480 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 220.230 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj16ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj16ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 178 registers, used 1 barriers, 16 bytes cumulative stack size, 480 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 224.575 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj16ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj16ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 178 registers, used 1 barriers, 16 bytes cumulative stack size, 480 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 224.262 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 253 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 265.648 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 253 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 265.222 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 249 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 272.418 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 
249 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 271.543 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 244.082 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 245.818 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 253 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 270.600 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 253 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 271.513 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 285.432 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 284.214 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 290.738 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 290.681 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 281.864 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 289.314 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 298.218 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 297.861 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads 
+sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 547.650 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 515.155 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 248 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 548.305 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 248 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 522.745 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 426.900 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 405.909 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 539.708 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 512.176 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 248 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 555.106 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 248 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 529.515 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 253 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 566.544 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 253 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 539.557 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 249 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 554.977 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads 
+sage_attention-torch-ext> ptxas info : Used 249 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 526.419 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 250 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 565.716 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 250 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 537.362 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 201 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 242.289 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 205 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 234.398 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 205 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 248.279 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 207 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 239.625 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 204 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 241.500 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 204 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 234.725 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 200 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 247.759 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 200 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 239.854 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 199 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 261.146 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 199 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 251.628 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads 
+sage_attention-torch-ext> ptxas info : Used 199 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 267.309 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 199 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 258.554 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 198 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 259.758 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 198 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 252.118 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 200 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 267.376 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
[... ptxas statistics for the remaining qk_int_sv_f16_attn_kernel template instantiations targeting 'sm_80' (half and bfloat16 outputs across the QuantGranularity and MaskMode combinations): each entry reports 200-255 registers, a 0-112 byte stack frame, at most 68 bytes of spill stores and 76 bytes of spill loads, 480 bytes cmem[0], and a compile time of roughly 245-500 ms ...]
+sage_attention-torch-ext> [12/12] Linking CXX shared module _sage_attention_44b112f_dirty.abi3.so
+sage_attention-torch-ext> buildPhase completed in 3 minutes 37 seconds
+sage_attention-torch-ext> Running phase: installPhase
+sage_attention-torch-ext> install flags: -j21 install
+sage_attention-torch-ext> [0/1] Install the project...
+sage_attention-torch-ext> -- Install configuration: "Release"
+sage_attention-torch-ext> -- Installing: /nix/store/6jmb39fj6a2hjg90bjmrxna5vkivwy8n-sage_attention-torch-ext/_sage_attention_44b112f_dirty/_sage_attention_44b112f_dirty.abi3.so
+sage_attention-torch-ext> Running phase: fixupPhase
+sage_attention-torch-ext> shrinking RPATHs of ELF executables and libraries in /nix/store/6jmb39fj6a2hjg90bjmrxna5vkivwy8n-sage_attention-torch-ext
+sage_attention-torch-ext> shrinking /nix/store/6jmb39fj6a2hjg90bjmrxna5vkivwy8n-sage_attention-torch-ext/sage_attention/_sage_attention_44b112f_dirty.abi3.so
+sage_attention-torch-ext> checking for references to /build/ in /nix/store/6jmb39fj6a2hjg90bjmrxna5vkivwy8n-sage_attention-torch-ext...
+sage_attention-torch-ext> patching script interpreter paths in /nix/store/6jmb39fj6a2hjg90bjmrxna5vkivwy8n-sage_attention-torch-ext
+sage_attention-torch-ext> Running phase: installCheckPhase
+sage_attention-torch-ext> no Makefile or custom installCheckPhase, doing nothing
+sage_attention-torch-ext> Checking of ABI compatibility
+sage_attention-torch-ext> 🐍 Checking for compatibility with manylinux_2_28 and Python ABI version 3.9
+sage_attention-torch-ext> ✅ No compatibility issues found
+sage_attention-torch-ext> Checking loading kernel with get_kernel
+sage_attention-torch-ext> Check whether the kernel can be loaded with get-kernel: sage_attention
+sage_attention-torch-ext> [11/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/qk_int_sv_f16_cuda_sm80.cu.o
+sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
+sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4]
[... ptxas statistics for the qk_int_sv_f16_attn_kernel instantiations in qk_int_sv_f16_cuda_sm80.cu targeting 'sm_80': each entry reports 176-255 registers, a 0-32 byte stack frame, no register spills, and 480 bytes cmem[0] ...]
+sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 248 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 248 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill 
stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 248 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 248 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 253 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 253 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 249 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 249 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 250 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 250 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 201 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 205 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 205 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 207 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 204 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 204 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes 
spill loads +sage_attention-torch-ext> ptxas info : Used 200 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 200 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 199 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 199 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 199 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 199 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 198 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 198 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 200 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 200 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 32 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 32 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 32 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 32 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf 
+sage_attention-torch-ext> 32 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 60 bytes spill stores, 68 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 60 bytes spill stores, 68 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 112 bytes stack frame, 68 bytes spill stores, 76 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 112 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 112 bytes stack frame, 68 bytes spill stores, 76 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 112 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 56 bytes stack frame, 56 bytes spill stores, 60 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 56 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 56 
bytes stack frame, 56 bytes spill stores, 60 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 56 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 96 bytes stack frame, 60 bytes spill stores, 64 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 96 bytes stack frame, 60 bytes spill stores, 64 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> 
ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> [12/12] Linking CXX shared module _sage_attention_44b112f_dirty.abi3.so +sage_attention-torch-ext> buildPhase completed in 3 minutes 42 seconds +sage_attention-torch-ext> Running phase: installPhase +sage_attention-torch-ext> install flags: -j21 install +sage_attention-torch-ext> [0/1] Install the project... 
+sage_attention-torch-ext> -- Install configuration: "Release"
+sage_attention-torch-ext> -- Installing: /nix/store/794gpxvpsn0jkzi2zndpd6i6nhspwid2-sage_attention-torch-ext/_sage_attention_44b112f_dirty/_sage_attention_44b112f_dirty.abi3.so
+sage_attention-torch-ext> Running phase: fixupPhase
+sage_attention-torch-ext> shrinking RPATHs of ELF executables and libraries in /nix/store/794gpxvpsn0jkzi2zndpd6i6nhspwid2-sage_attention-torch-ext
+sage_attention-torch-ext> shrinking /nix/store/794gpxvpsn0jkzi2zndpd6i6nhspwid2-sage_attention-torch-ext/sage_attention/_sage_attention_44b112f_dirty.abi3.so
+sage_attention-torch-ext> checking for references to /build/ in /nix/store/794gpxvpsn0jkzi2zndpd6i6nhspwid2-sage_attention-torch-ext...
+sage_attention-torch-ext> patching script interpreter paths in /nix/store/794gpxvpsn0jkzi2zndpd6i6nhspwid2-sage_attention-torch-ext
+sage_attention-torch-ext> Running phase: installCheckPhase
+sage_attention-torch-ext> no Makefile or custom installCheckPhase, doing nothing
+sage_attention-torch-ext> Checking of ABI compatibility
+sage_attention-torch-ext> 🐍 Checking for compatibility with manylinux_2_28 and Python ABI version 3.9
+sage_attention-torch-ext> ✅ No compatibility issues found
+sage_attention-torch-ext> Checking loading kernel with get_kernel
+sage_attention-torch-ext> Check whether the kernel can be loaded with get-kernel: sage_attention
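This final check loads the freshly built extension through the get_kernel helper of the Hugging Face kernels package. For reference, a roughly equivalent load from the Hub could look like the sketch below; the repo id is an assumption for illustration and is not taken from this log:

    # Minimal sketch of loading the kernel with get_kernel (assumed repo id).
    from kernels import get_kernel

    sage_attention = get_kernel("kernels-community/sage_attention")  # hypothetical repo id
    # Listing the public attributes confirms the abi3 extension imported cleanly.
    print([name for name in dir(sage_attention) if not name.startswith("_")])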
+sage_attention-torch-ext> [11/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/qk_int_sv_f16_cuda_sm80.cu.o
+sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
+sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4]
[... per-kernel ptxas resource-usage and compile-time reports for the qk_int_sv_f16_attn_kernel instantiations for 'sm_80' (__half/__nv_bfloat16 variants, QuantGranularity 2/3, MaskMode 0/1): 176-255 registers, 1 barrier, 0-32 bytes stack frame, no spill stores or loads, 480 bytes cmem[0] plus 8-16 bytes cmem[2] for some variants, compile times roughly 200-600 ms per instantiation ...]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80'
+sage_attention-torch-ext> ptxas info : Function properties for
_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 201 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 248.386 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 205 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 238.228 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 205 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 254.757 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 207 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 243.618 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads 
+sage_attention-torch-ext> ptxas info : Used 204 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 246.223 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 204 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 237.605 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 200 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 252.583 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 200 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 245.237 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 199 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 263.119 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 199 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 254.398 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 199 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 270.412 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 199 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 261.949 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 198 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 263.559 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 198 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 254.463 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 200 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 271.281 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 200 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 263.448 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 32 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 483.972 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 32 bytes spill stores, 
40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 478.741 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 32 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 492.247 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 32 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 499.507 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 478.163 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 
480.421 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 478.013 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 479.704 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 60 bytes spill stores, 68 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 492.647 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 60 bytes spill stores, 68 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 496.343 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 112 bytes stack frame, 68 bytes spill stores, 76 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 112 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 506.130 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 112 bytes stack frame, 68 bytes spill stores, 76 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 112 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 505.690 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 56 bytes stack frame, 56 bytes spill stores, 60 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 56 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 499.671 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 56 bytes stack frame, 56 bytes spill stores, 60 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 56 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 496.812 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' 
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 96 bytes stack frame, 60 bytes spill stores, 64 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 507.126 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 96 bytes stack frame, 60 bytes spill stores, 64 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 505.885 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 250.743 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 250.403 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes 
stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 258.148 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 257.065 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 250.576 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 249.934 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 257.861 ms +sage_attention-torch-ext> ptxas info : 
Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 257.579 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 269.519 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 269.729 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 277.652 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 277.765 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 268.882 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 269.865 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 276.602 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> 
ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 276.939 ms
+sage_attention-torch-ext> [12/12] Linking CXX shared module _sage_attention_44b112f_dirty.abi3.so
+sage_attention-torch-ext> buildPhase completed in 3 minutes 45 seconds
+sage_attention-torch-ext> Running phase: installPhase
+sage_attention-torch-ext> install flags: -j21 install
+sage_attention-torch-ext> [0/1] Install the project...
+sage_attention-torch-ext> -- Install configuration: "Release"
+sage_attention-torch-ext> -- Installing: /nix/store/x0vcv18dr2mcj5ih9i3aq3nshydimpca-sage_attention-torch-ext/_sage_attention_44b112f_dirty/_sage_attention_44b112f_dirty.abi3.so
+sage_attention-torch-ext> Running phase: fixupPhase
+sage_attention-torch-ext> shrinking RPATHs of ELF executables and libraries in /nix/store/x0vcv18dr2mcj5ih9i3aq3nshydimpca-sage_attention-torch-ext
+sage_attention-torch-ext> shrinking /nix/store/x0vcv18dr2mcj5ih9i3aq3nshydimpca-sage_attention-torch-ext/sage_attention/_sage_attention_44b112f_dirty.abi3.so
+sage_attention-torch-ext> checking for references to /build/ in /nix/store/x0vcv18dr2mcj5ih9i3aq3nshydimpca-sage_attention-torch-ext...
+sage_attention-torch-ext> patching script interpreter paths in /nix/store/x0vcv18dr2mcj5ih9i3aq3nshydimpca-sage_attention-torch-ext
+sage_attention-torch-ext> Running phase: installCheckPhase
+sage_attention-torch-ext> no Makefile or custom installCheckPhase, doing nothing
+sage_attention-torch-ext> Checking of ABI compatibility
+sage_attention-torch-ext> 🐍 Checking for compatibility with manylinux_2_28 and Python ABI version 3.9
+sage_attention-torch-ext> ✅ No compatibility issues found
+sage_attention-torch-ext> Checking loading kernel with get_kernel
+sage_attention-torch-ext> Check whether the kernel can be loaded with get-kernel: sage_attention
+sage_attention-torch-ext> [11/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/qk_int_sv_f16_cuda_sm80.cu.o
+sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
+sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80'
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80'
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 249 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 250 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 249 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties 
+sage_attention-torch-ext> ptxas info : [verbose per-instantiation output elided] remaining qk_int_sv_f16_attn_kernel template instantiations compiled for 'sm_80'
+sage_attention-torch-ext> ptxas info : 176-255 registers, 0-64 bytes stack frame, 480 bytes cmem[0], up to 16 bytes cmem[2]; spilling (at most 32 bytes spill stores / 40 bytes spill loads) occurs only in a few variants at the 255-register ceiling
+sage_attention-torch-ext> 64 bytes stack frame, 60 bytes spill stores, 68 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 60 bytes spill stores, 68 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 112 bytes stack frame, 68 bytes spill stores, 76 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 112 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 112 bytes stack frame, 68 bytes spill stores, 76 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 112 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 56 bytes stack frame, 56 bytes spill stores, 60 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 56 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 56 bytes stack frame, 56 bytes spill stores, 60 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 56 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 96 bytes stack frame, 60 bytes spill stores, 64 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 96 bytes stack frame, 60 bytes spill stores, 64 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 
bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> 
ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> [9/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_fuse_v_mean_attn.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : 28 bytes gmem, 224 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 32 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 540.842 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 32 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 537.877 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 24 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 550.018 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 24 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 548.929 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 536.450 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 537.209 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 534.340 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 534.521 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 471.004 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 472.920 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 48 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 573.035 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 48 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 571.750 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 469.703 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 473.748 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 571.073 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 567.513 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 245 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 235.378 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 245 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 236.776 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 231 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 242.118 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 
bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 231 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 241.920 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 236 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 235.965 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 236 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 234.354 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 231 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 239.999 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 231 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 242.139 ms 
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 234 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 257.654 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 234 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 255.864 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 260.515 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 260.248 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 238 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 255.341 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 238 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 254.405 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 233 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 260.494 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 233 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 260.957 ms +sage_attention-torch-ext> [12/12] Linking CXX shared module _sage_attention_44b112f_dirty.abi3.so +sage_attention-torch-ext> buildPhase completed in 3 minutes 49 seconds +sage_attention-torch-ext> Running phase: installPhase +sage_attention-torch-ext> install flags: -j21 install +sage_attention-torch-ext> [0/1] 
Install the project... +sage_attention-torch-ext> -- Install configuration: "Release" +sage_attention-torch-ext> -- Installing: /nix/store/wqplrwp0m14wqjzmh3x18a5pgc2kcs8k-sage_attention-torch-ext/_sage_attention_44b112f_dirty/_sage_attention_44b112f_dirty.abi3.so +sage_attention-torch-ext> Running phase: fixupPhase +sage_attention-torch-ext> shrinking RPATHs of ELF executables and libraries in /nix/store/wqplrwp0m14wqjzmh3x18a5pgc2kcs8k-sage_attention-torch-ext +sage_attention-torch-ext> shrinking /nix/store/wqplrwp0m14wqjzmh3x18a5pgc2kcs8k-sage_attention-torch-ext/sage_attention/_sage_attention_44b112f_dirty.abi3.so +sage_attention-torch-ext> checking for references to /build/ in /nix/store/wqplrwp0m14wqjzmh3x18a5pgc2kcs8k-sage_attention-torch-ext... +sage_attention-torch-ext> patching script interpreter paths in /nix/store/wqplrwp0m14wqjzmh3x18a5pgc2kcs8k-sage_attention-torch-ext +sage_attention-torch-ext> Running phase: installCheckPhase +sage_attention-torch-ext> no Makefile or custom installCheckPhase, doing nothing +sage_attention-torch-ext> Checking of ABI compatibility +sage_attention-torch-ext> 🐍 Checking for compatibility with manylinux_2_28 and Python ABI version 3.9 +sage_attention-torch-ext> ✅ No compatibility issues found +sage_attention-torch-ext> Checking loading kernel with get_kernel +sage_attention-torch-ext> Check whether the kernel can be loaded with get-kernel: sage_attention +sage_attention-torch-ext> [10/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/fused/fused.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : 28 bytes gmem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb1E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb1E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 392 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 33.289 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb1E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb1E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 392 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 29.922 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb0E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb0E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 260 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 25.506 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb0E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z15MeanScaleKernelILj64ELb0E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 260 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 24.229 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj128ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj128ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 21 registers, used 1 barriers, 32768 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 4.507 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj64ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj64ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 19 registers, used 1 barriers, 16384 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 4.402 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj128ELj64ELb1E6__halfEvPT2_S2_jjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj128ELj64ELb1E6__halfEvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 21 registers, used 1 barriers, 32768 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 4.284 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 19 registers, used 1 barriers, 16384 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 4.249 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers +sage_attention-torch-ext> ptxas info : Compile time = 4.153 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers +sage_attention-torch-ext> ptxas info : Compile time = 4.124 ms +sage_attention-torch-ext> ptxas info : Compiling entry 
function '_Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 20 registers, used 0 barriers +sage_attention-torch-ext> ptxas info : Compile time = 3.441 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 20 registers, used 0 barriers +sage_attention-torch-ext> ptxas info : Compile time = 3.396 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 7.558 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 7.568 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 7.634 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 7.541 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads 
+sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 7.451 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 7.412 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 7.383 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 7.331 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 38 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 12.259 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 29 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 8.726 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 8.702 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 29 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 8.676 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 38 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 11.785 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 29 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 8.582 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 8.636 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 29 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 8.604 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 11.001 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 7.682 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 8.202 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 7.571 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 10.582 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 7.640 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 7.583 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 7.689 ms +sage_attention-torch-ext> ptxas info : Compiling entry 
function '_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 11.382 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 8.123 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 8.179 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 8.029 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 11.805 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 8.019 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj 
+sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 8.021 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 7.987 ms +sage_attention-torch-ext> ptxas info : 28 bytes gmem, 224 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb1E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb1E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 392 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 22.199 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb1E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb1E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 392 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 20.236 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb0E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb0E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 260 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 15.886 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb0E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb0E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 260 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 15.927 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj128ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj128ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 32768 bytes smem, 396 bytes cmem[0] +sage_attention-torch-ext> ptxas info 
: Compile time = 3.053 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj64ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj64ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 20 registers, used 1 barriers, 16384 bytes smem, 396 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 3.023 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj128ELj64ELb1E6__halfEvPT2_S2_jjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj128ELj64ELb1E6__halfEvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 32768 bytes smem, 396 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 3.037 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 20 registers, used 1 barriers, 16384 bytes smem, 396 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 3.014 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 2.934 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 2.911 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 20 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 2.539 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj 
+sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 20 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 2.515 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 6.825 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 6.758 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 6.799 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 6.780 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 6.809 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 
barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 6.821 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 6.860 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 6.802 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 38 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 10.542 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.751 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.764 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.734 ms +sage_attention-torch-ext> 
ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 38 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 10.465 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.805 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.770 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.803 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 9.826 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 6.861 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> 
ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 6.836 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 6.775 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 9.283 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 6.797 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 6.852 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 6.792 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 
0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 9.622 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 6.937 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 6.951 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.403 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 9.823 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 6.956 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] 
+sage_attention-torch-ext> ptxas info : Compile time = 6.932 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 6.920 ms +sage_attention-torch-ext> ptxas info : 28 bytes gmem, 224 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb1E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb1E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 39 registers, used 1 barriers, 392 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 29.989 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb1E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb1E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 39 registers, used 1 barriers, 392 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 27.743 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb0E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb0E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 40 registers, used 1 barriers, 260 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 21.940 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb0E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb0E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 40 registers, used 1 barriers, 260 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 22.046 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj128ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj128ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 21 registers, used 1 barriers, 32768 bytes smem, 396 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 3.158 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25TransposePadPermuteKernelILj64ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj64ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 19 registers, used 1 barriers, 16384 bytes smem, 396 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 3.085 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj128ELj64ELb1E6__halfEvPT2_S2_jjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj128ELj64ELb1E6__halfEvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 21 registers, used 1 barriers, 32768 bytes smem, 396 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 3.102 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 19 registers, used 1 barriers, 16384 bytes smem, 396 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 3.056 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 2.995 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 2.998 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 18 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 2.554 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads 
+sage_attention-torch-ext> ptxas info : Used 18 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 2.556 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 6.914 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 6.864 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 6.853 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 6.860 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 6.920 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile 
time = 6.835 ms
[ptxas output condensed: the QuantInt8Kernel template instantiations in this span (__half and __nv_bfloat16 variants, block sizes 64/128) compile for 'sm_89' using 25-40 registers, 1 barrier, 132 bytes smem and 432 bytes cmem[0] each, with 0 bytes stack frame and 0 bytes spill stores/loads; compile times range from about 6.8 ms to 10.6 ms per instantiation.]
+sage_attention-torch-ext> [11/12] Building CUDA object CMakeFiles/_sage_attention_44b112f_dirty.dir/sage_attention/qattn/qk_int_sv_f16_cuda_sm80.cu.o
+sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
+sage_attention-torch-ext> ptxas info : 28 bytes gmem, 224 bytes cmem[4]
[ptxas output condensed: the qk_int_sv_f16_attn_kernel template instantiations compile for 'sm_80' using 176-255 registers, 1 barrier, 480 bytes cmem[0], 8 or 16 bytes cmem[2] where present, and 0-32 bytes stack frame each, with 0 bytes spill stores/loads throughout; compile times range from roughly 200 ms to 570 ms per instantiation.]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80'
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes
spill loads +sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 461.929 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 440.182 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 472.256 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 449.929 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 250 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 460.100 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 250 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 437.509 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 471.040 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 449.202 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 201 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 237.683 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 205 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 228.527 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 205 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 241.745 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 207 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 232.871 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 204 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 235.932 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads 
+sage_attention-torch-ext> ptxas info : Used 204 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 227.876 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 200 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 242.745 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 200 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 234.589 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 199 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 253.400 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 199 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 245.344 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 199 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 260.897 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 199 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 252.860 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 198 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 254.826 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 198 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 245.663 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 200 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 261.039 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 200 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 251.908 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 32 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 460.584 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 32 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 458.619 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 
64 bytes stack frame, 32 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 475.564 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 32 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 476.445 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 461.818 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 463.274 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] 
+sage_attention-torch-ext> ptxas info : Compile time = 464.385 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 466.754 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 60 bytes spill stores, 68 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 481.417 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 60 bytes spill stores, 68 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 478.971 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 112 bytes stack frame, 68 bytes spill stores, 76 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 112 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 488.346 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 112 bytes stack frame, 68 bytes spill stores, 76 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 112 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 489.099 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 56 bytes stack frame, 56 bytes spill stores, 60 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 56 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 478.063 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 56 bytes stack frame, 56 bytes spill stores, 60 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 56 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 477.882 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 96 bytes stack frame, 60 bytes spill stores, 64 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 487.670 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' 
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 96 bytes stack frame, 60 bytes spill stores, 64 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 485.269 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 239.050 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 236.372 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 241.725 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 
bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 242.865 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 235.112 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 235.454 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 242.402 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 242.226 ms +sage_attention-torch-ext> ptxas info : Compiling entry 
function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 254.271 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 253.270 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 260.606 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 259.949 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 253.865 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 253.525 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 260.816 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 259.752 ms +sage_attention-torch-ext> [12/12] Linking CXX shared module _sage_attention_44b112f_dirty.abi3.so +sage_attention-torch-ext> buildPhase completed in 5 minutes 7 seconds +sage_attention-torch-ext> Running phase: installPhase +sage_attention-torch-ext> install flags: -j21 install +sage_attention-torch-ext> [0/1] Install the project... 
+sage_attention-torch-ext> -- Install configuration: "Release" +sage_attention-torch-ext> -- Installing: /nix/store/vas14gsv17kdkap3r9szvjsclanmwq25-sage_attention-torch-ext/_sage_attention_44b112f_dirty/_sage_attention_44b112f_dirty.abi3.so +sage_attention-torch-ext> Running phase: fixupPhase +sage_attention-torch-ext> shrinking RPATHs of ELF executables and libraries in /nix/store/vas14gsv17kdkap3r9szvjsclanmwq25-sage_attention-torch-ext +sage_attention-torch-ext> shrinking /nix/store/vas14gsv17kdkap3r9szvjsclanmwq25-sage_attention-torch-ext/sage_attention/_sage_attention_44b112f_dirty.abi3.so +sage_attention-torch-ext> checking for references to /build/ in /nix/store/vas14gsv17kdkap3r9szvjsclanmwq25-sage_attention-torch-ext... +sage_attention-torch-ext> patching script interpreter paths in /nix/store/vas14gsv17kdkap3r9szvjsclanmwq25-sage_attention-torch-ext +sage_attention-torch-ext> Running phase: installCheckPhase +sage_attention-torch-ext> no Makefile or custom installCheckPhase, doing nothing +sage_attention-torch-ext> Checking of ABI compatibility +sage_attention-torch-ext> 🐍 Checking for compatibility with manylinux_2_28 and Python ABI version 3.9 +sage_attention-torch-ext> ✅ No compatibility issues found +sage_attention-torch-ext> Checking loading kernel with get_kernel +sage_attention-torch-ext> Check whether the kernel can be loaded with get-kernel: sage_attention +building '/nix/store/xq28asxbqp6g7x8bcz92xl849prg2899-torch-ext-bundle.drv'... +building '/nix/store/rkzh9xwk6kdgl1by4xfwmyvb5arpfqby-build-and-copy.drv'... diff --git a/sage_attention/cuda_tensormap_shim.cuh b/sage_attention/cuda_tensormap_shim.cuh index 2b8dc2ff500a7486611c15d9ede13333854235d6..cadddb6646e4c38e76e1fe0cfefbf8e17604b7ed 100644 --- a/sage_attention/cuda_tensormap_shim.cuh +++ b/sage_attention/cuda_tensormap_shim.cuh @@ -1,61 +1,46 @@ -/* - * Lightweight compatibility shim for CUDA tensor map APIs. - * Provides fallbacks for CUtensorMap and related enums when compiling - * against CUDA toolkits that don't expose these symbols in headers. - */ - #pragma once - #include -// Guard on CUDA version and symbol presence. Some environments have -// runtime symbols but not headers; we define minimal stand-ins. 
+// Provide fallbacks only if CUDA headers don’t define tensor map +#if !defined(CU_TENSOR_MAP_NUM_QWORDS) -#ifndef CU_TENSOR_MAP_L2_PROMOTION_NONE -typedef enum CUtensorMapL2promotion_enum { - CU_TENSOR_MAP_L2_PROMOTION_NONE = 0, - CU_TENSOR_MAP_L2_PROMOTION_L2_64B = 1, - CU_TENSOR_MAP_L2_PROMOTION_L2_128B = 2 -} CUtensorMapL2promotion_enum; +// Layout-compatible stand-in +#if defined(__cplusplus) && (__cplusplus >= 201103L) +struct alignas(64) CUtensorMap_st { unsigned long long opaque[16]; }; +#else +struct CUtensorMap_st { unsigned long long opaque[16]; }; #endif - -#ifndef CUtensorMap -typedef struct CUtensorMap_st { - unsigned long long data[16]; -} CUtensorMap; -#endif - -#ifndef CU_TENSOR_MAP_DATA_TYPE_UINT8 -typedef enum CUtensorMapDataType { - CU_TENSOR_MAP_DATA_TYPE_UINT8 = 1, - CU_TENSOR_MAP_DATA_TYPE_INT8 = 2, - CU_TENSOR_MAP_DATA_TYPE_FLOAT16 = 10, - CU_TENSOR_MAP_DATA_TYPE_BFLOAT16 = 13 +typedef CUtensorMap_st CUtensorMap; + +// Minimal enums used by create_tensor_map_4D +typedef enum CUtensorMapDataType_enum { + CU_TENSOR_MAP_DATA_TYPE_UINT8 = 0, + CU_TENSOR_MAP_DATA_TYPE_FLOAT16 = 6, + CU_TENSOR_MAP_DATA_TYPE_FLOAT32 = 7, + CU_TENSOR_MAP_DATA_TYPE_FLOAT64 = 8, + CU_TENSOR_MAP_DATA_TYPE_BFLOAT16 = 10 } CUtensorMapDataType; -#endif -#ifndef CU_TENSOR_MAP_INTERLEAVE_NONE typedef enum CUtensorMapInterleave_enum { - CU_TENSOR_MAP_INTERLEAVE_NONE = 0 -} CUtensorMapInterleave_enum; -#endif + CU_TENSOR_MAP_INTERLEAVE_NONE = 0 +} CUtensorMapInterleave; -#ifndef CU_TENSOR_MAP_SWIZZLE_32B typedef enum CUtensorMapSwizzle_enum { - CU_TENSOR_MAP_SWIZZLE_NONE = 0, - CU_TENSOR_MAP_SWIZZLE_32B = 1, - CU_TENSOR_MAP_SWIZZLE_64B = 2, - CU_TENSOR_MAP_SWIZZLE_128B = 3 -} CUtensorMapSwizzle_enum; -#endif + CU_TENSOR_MAP_SWIZZLE_NONE = 0, + CU_TENSOR_MAP_SWIZZLE_32B, + CU_TENSOR_MAP_SWIZZLE_64B, + CU_TENSOR_MAP_SWIZZLE_128B +} CUtensorMapSwizzle; -#ifndef CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE -typedef enum CUtensorMapFloatOOBfill_enum { - CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE = 0 -} CUtensorMapFloatOOBfill_enum; -#endif - -// We intentionally do not declare cuTensorMapEncodeTiled here; the code -// dynamically resolves it from libcuda at runtime when available. +typedef enum CUtensorMapL2promotion_enum { + CU_TENSOR_MAP_L2_PROMOTION_NONE = 0, + CU_TENSOR_MAP_L2_PROMOTION_L2_64B, + CU_TENSOR_MAP_L2_PROMOTION_L2_128B +} CUtensorMapL2promotion; +typedef enum CUtensorMapFloatOOBfill_enum { + CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE = 0 +} CUtensorMapFloatOOBfill; +#endif // !defined(CU_TENSOR_MAP_NUM_QWORDS) +// no declaration of cuTensorMapEncodeTiled here; it’s resolved at runtime \ No newline at end of file diff --git a/sage_attention/qattn/qk_int_sv_f8_cuda_sm89.cuh b/sage_attention/qattn/qk_int_sv_f8_cuda_sm89.cuh index d829aa44b90259cc9b1a647e20f6056d41f93a9f..be88db5b9639eafe7163e3091bd343afec489167 100644 --- a/sage_attention/qattn/qk_int_sv_f8_cuda_sm89.cuh +++ b/sage_attention/qattn/qk_int_sv_f8_cuda_sm89.cuh @@ -15,7 +15,7 @@ */ #include "../utils.cuh" -#include +// #include #include #include diff --git a/torch-ext/sage_attention/_ops.py b/torch-ext/sage_attention/_ops.py deleted file mode 100644 index 948e49e1434a82a7b94b2d78caf7d588365d9e72..0000000000000000000000000000000000000000 --- a/torch-ext/sage_attention/_ops.py +++ /dev/null @@ -1,9 +0,0 @@ -import torch -from . import _sage_attention_57cb7ec_dirty -ops = torch.ops._sage_attention_57cb7ec_dirty - -def add_op_namespace_prefix(op_name: str): - """ - Prefix op by namespace. 
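Aside on the shim's closing note (cuTensorMapEncodeTiled is not declared in the header and is resolved at runtime): the sketch below shows one common way such lazy resolution can be done, by looking the symbol up in libcuda.so.1 with dlopen/dlsym and reusing the stand-in types defined above. This is an illustrative sketch only, not code from this change: the helper name get_cu_tensor_map_encode_tiled, the dlopen-based lookup, and the static_asserts are assumptions, and the function-pointer signature is transcribed from the CUDA driver API documentation.

// Illustrative sketch (not part of this change): lazily resolve
// cuTensorMapEncodeTiled from the driver at runtime, so the shim above only
// needs the layout-compatible CUtensorMap stand-in and its enums.
#include <dlfcn.h>    // dlopen / dlsym (POSIX)
#include <cstdint>
#include "sage_attention/cuda_tensormap_shim.cuh"

// Return type is the driver's CUresult (an int-sized enum; 0 == CUDA_SUCCESS).
using cuTensorMapEncodeTiled_t = int (*)(
    CUtensorMap* tensorMap, CUtensorMapDataType dataType, uint32_t rank,
    void* globalAddress, const uint64_t* globalDim,
    const uint64_t* globalStrides,            // rank - 1 entries
    const uint32_t* boxDim, const uint32_t* elementStrides,
    CUtensorMapInterleave interleave, CUtensorMapSwizzle swizzle,
    CUtensorMapL2promotion l2Promotion, CUtensorMapFloatOOBfill oobFill);

// Resolve once; nullptr means the installed driver has no tensor-map support.
inline cuTensorMapEncodeTiled_t get_cu_tensor_map_encode_tiled() {
  static cuTensorMapEncodeTiled_t fn = [] {
    void* libcuda = dlopen("libcuda.so.1", RTLD_NOW | RTLD_GLOBAL);
    return libcuda ? reinterpret_cast<cuTensorMapEncodeTiled_t>(
                         dlsym(libcuda, "cuTensorMapEncodeTiled"))
                   : nullptr;
  }();
  return fn;
}

// The stand-in must stay layout-compatible with the real driver type:
// 16 x 64-bit opaque words (128 bytes), 64-byte aligned.
static_assert(sizeof(CUtensorMap) == 128, "CUtensorMap stand-in has wrong size");
static_assert(alignof(CUtensorMap) == 64, "CUtensorMap stand-in has wrong alignment");

Resolving the symbol this way keeps the extension loadable on drivers and toolkits that predate tensor maps; callers simply check the returned pointer for nullptr before taking any tensor-map path.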
- """ - return f"_sage_attention_57cb7ec_dirty::{op_name}" \ No newline at end of file diff --git a/torch-ext/sage_attention/_sage_attention_57cb7ec_dirty.abi3.so b/torch-ext/sage_attention/_sage_attention_57cb7ec_dirty.abi3.so deleted file mode 100755 index f14c4d32d36a2eccc732740e76a085337076546b..0000000000000000000000000000000000000000 --- a/torch-ext/sage_attention/_sage_attention_57cb7ec_dirty.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:afa4831d0d218167c818a3871cf9fc01f154a6fc3c4671efdfede77a83e3b083 -size 26036368 diff --git a/torch-ext/torch_binding.cpp b/torch-ext/torch_binding.cpp index 54b6d54261acf81d499d93fad0e814887d6a337a..6bf79b2239679f5258844f7f51be55553d340c1c 100644 --- a/torch-ext/torch_binding.cpp +++ b/torch-ext/torch_binding.cpp @@ -2,260 +2,7 @@ #include "registration.h" #include "torch_binding.h" -#include -void sm_check_89(torch::Tensor x, std::string op_name) { - int device_index = x.get_device(); - const auto& prop = at::cuda::getDeviceProperties(device_index); - - std::cerr << "sm_check_89: prop->major: " << prop->major << std::endl; - std::cerr << "sm_check_89: prop->minor: " << prop->minor << std::endl; - - if (prop->major < 8 || (prop->major == 8 && prop->minor < 9)) { - TORCH_CHECK(false, op_name + " requires compute capability 8.9+"); - } -} - -void sm_check_90(torch::Tensor x, std::string op_name) { - int device_index = x.get_device(); - const auto& prop = at::cuda::getDeviceProperties(device_index); - - std::cerr << "sm_check_90: prop->major: " << prop->major << std::endl; - std::cerr << "sm_check_90: prop->minor: " << prop->minor << std::endl; - - if (prop->major < 9) { - TORCH_CHECK(false, op_name + " requires compute capability 9.0+"); - } -} - -void sm_check_80(torch::Tensor x, std::string op_name) { - int device_index = x.get_device(); - const auto& prop = at::cuda::getDeviceProperties(device_index); - std::cerr << "sm_check_80: prop->major: " << prop->major << std::endl; - std::cerr << "sm_check_80: prop->minor: " << prop->minor << std::endl; - if (prop->major < 8) { - TORCH_CHECK(false, op_name + " requires compute capability 8.0+"); - } -} - -// ############################################################################## -// SM89 -// ############################################################################## -static at::Tensor qk_int8_sv_f8_accum_f32_attn_wrap( - at::Tensor q, at::Tensor k, at::Tensor v, at::Tensor o, - at::Tensor q_scale, at::Tensor k_scale, - int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, - double sm_scale, int64_t return_lse) { - sm_check_89(q, "qk_int8_sv_f8_accum_f32_attn"); - return qk_int8_sv_f8_accum_f32_attn( - q, k, v, o, q_scale, k_scale, - static_cast(tensor_layout), static_cast(is_causal), static_cast(qk_quant_gran), - static_cast(sm_scale), static_cast(return_lse)); -} - -static at::Tensor qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_wrap( - at::Tensor q, at::Tensor k, at::Tensor v, at::Tensor o, - at::Tensor q_scale, at::Tensor k_scale, at::Tensor v_scale, - int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, - double sm_scale, int64_t return_lse) { - sm_check_89(q, "qk_int8_sv_f8_accum_f32_fuse_v_scale_attn"); - return qk_int8_sv_f8_accum_f32_fuse_v_scale_attn( - q, k, v, o, q_scale, k_scale, v_scale, - static_cast(tensor_layout), static_cast(is_causal), static_cast(qk_quant_gran), - static_cast(sm_scale), static_cast(return_lse)); -} - -static at::Tensor qk_int8_sv_f8_accum_f32_fuse_v_scale_fuse_v_mean_attn_wrap( - at::Tensor q, 
at::Tensor k, at::Tensor v, at::Tensor o, - at::Tensor q_scale, at::Tensor k_scale, at::Tensor v_scale, at::Tensor v_mean, - int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, - double sm_scale, int64_t return_lse) { - sm_check_89(q, "qk_int8_sv_f8_accum_f32_fuse_v_scale_fuse_v_mean_attn"); - return qk_int8_sv_f8_accum_f32_fuse_v_scale_fuse_v_mean_attn( - q, k, v, o, q_scale, k_scale, v_scale, v_mean, - static_cast(tensor_layout), static_cast(is_causal), static_cast(qk_quant_gran), - static_cast(sm_scale), static_cast(return_lse)); -} - -static at::Tensor qk_int8_sv_f8_accum_f32_attn_inst_buf_wrap( - at::Tensor q, at::Tensor k, at::Tensor v, at::Tensor o, - at::Tensor q_scale, at::Tensor k_scale, - int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, - double sm_scale, int64_t return_lse) { - sm_check_89(q, "qk_int8_sv_f8_accum_f32_attn_inst_buf"); - return qk_int8_sv_f8_accum_f32_attn_inst_buf( - q, k, v, o, q_scale, k_scale, - static_cast(tensor_layout), static_cast(is_causal), static_cast(qk_quant_gran), - static_cast(sm_scale), static_cast(return_lse)); -} - -static at::Tensor qk_int8_sv_f8_accum_f16_attn_inst_buf_wrap( - at::Tensor q, at::Tensor k, at::Tensor v, at::Tensor o, - at::Tensor q_scale, at::Tensor k_scale, - int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, - double sm_scale, int64_t return_lse) { - sm_check_89(q, "qk_int8_sv_f8_accum_f16_attn_inst_buf"); - return qk_int8_sv_f8_accum_f16_attn_inst_buf( - q, k, v, o, q_scale, k_scale, - static_cast(tensor_layout), static_cast(is_causal), static_cast(qk_quant_gran), - static_cast(sm_scale), static_cast(return_lse)); -} - -static at::Tensor qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf_wrap( - at::Tensor q, at::Tensor k, at::Tensor v, at::Tensor o, - at::Tensor q_scale, at::Tensor k_scale, at::Tensor v_scale, - int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, - double sm_scale, int64_t return_lse) { - sm_check_89(q, "qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf"); - return qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf( - q, k, v, o, q_scale, k_scale, v_scale, - static_cast(tensor_layout), static_cast(is_causal), static_cast(qk_quant_gran), - static_cast(sm_scale), static_cast(return_lse)); -} - -static at::Tensor qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf_wrap( - at::Tensor q, at::Tensor k, at::Tensor v, at::Tensor o, - at::Tensor q_scale, at::Tensor k_scale, at::Tensor v_scale, - int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, - double sm_scale, int64_t return_lse) { - sm_check_89(q, "qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf"); - return qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf( - q, k, v, o, q_scale, k_scale, v_scale, - static_cast(tensor_layout), static_cast(is_causal), static_cast(qk_quant_gran), - static_cast(sm_scale), static_cast(return_lse)); -} - - -// ############################################################################## -// SM90 -// ############################################################################## - -static at::Tensor qk_int8_sv_f8_accum_f32_attn_inst_buf_sm90_wrap( - at::Tensor q, at::Tensor k, at::Tensor v, at::Tensor o, - at::Tensor q_scale, at::Tensor k_scale, - int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, - double sm_scale, int64_t return_lse) { - sm_check_90(q, "qk_int8_sv_f8_accum_f32_attn_inst_buf_sm90"); -return qk_int8_sv_f8_accum_f32_attn_inst_buf_sm90( - q, k, v, o, q_scale, k_scale, - static_cast(tensor_layout), static_cast(is_causal), 
static_cast(qk_quant_gran), - static_cast(sm_scale), static_cast(return_lse)); -} - -static at::Tensor qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf_sm90_wrap( - at::Tensor q, at::Tensor k, at::Tensor v, at::Tensor o, - at::Tensor q_scale, at::Tensor k_scale, at::Tensor v_scale, - int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, - double sm_scale, int64_t return_lse) { - sm_check_90(q, "qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf_sm90"); -return qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf_sm90( - q, k, v, o, q_scale, k_scale, v_scale, - static_cast(tensor_layout), static_cast(is_causal), static_cast(qk_quant_gran), - static_cast(sm_scale), static_cast(return_lse)); -} - -// ############################################################################## -// SM80 -// ############################################################################## -static at::Tensor qk_int8_sv_f16_accum_f32_attn_wrap( - at::Tensor q, at::Tensor k, at::Tensor v, at::Tensor o, - at::Tensor q_scale, at::Tensor k_scale, - int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, - double sm_scale, int64_t return_lse) { - sm_check_80(q, "qk_int8_sv_f16_accum_f32_attn"); - return qk_int8_sv_f16_accum_f32_attn( - q, k, v, o, q_scale, k_scale, - static_cast(tensor_layout), static_cast(is_causal), static_cast(qk_quant_gran), - static_cast(sm_scale), static_cast(return_lse)); -} - -static at::Tensor qk_int8_sv_f16_accum_f16_attn_wrap( - at::Tensor q, at::Tensor k, at::Tensor v, at::Tensor o, - at::Tensor q_scale, at::Tensor k_scale, - int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, - double sm_scale, int64_t return_lse) { - sm_check_80(q, "qk_int8_sv_f16_accum_f16_attn"); - return qk_int8_sv_f16_accum_f16_attn( - q, k, v, o, q_scale, k_scale, - static_cast(tensor_layout), static_cast(is_causal), static_cast(qk_quant_gran), - static_cast(sm_scale), static_cast(return_lse)); -} - -static at::Tensor qk_int8_sv_f16_accum_f16_attn_inst_buf_wrap( - at::Tensor q, at::Tensor k, at::Tensor v, at::Tensor o, - at::Tensor q_scale, at::Tensor k_scale, - int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, - double sm_scale, int64_t return_lse) { - sm_check_80(q, "qk_int8_sv_f16_accum_f16_attn_inst_buf"); - return qk_int8_sv_f16_accum_f16_attn_inst_buf( - q, k, v, o, q_scale, k_scale, - static_cast(tensor_layout), static_cast(is_causal), static_cast(qk_quant_gran), - static_cast(sm_scale), static_cast(return_lse)); -} - -static at::Tensor qk_int8_sv_f16_accum_f16_fuse_v_mean_attn_wrap( - at::Tensor q, at::Tensor k, at::Tensor v, at::Tensor o, - at::Tensor q_scale, at::Tensor k_scale, at::Tensor v_mean, - int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, - double sm_scale, int64_t return_lse) { - sm_check_80(q, "qk_int8_sv_f16_accum_f16_fuse_v_mean_attn"); - return qk_int8_sv_f16_accum_f16_fuse_v_mean_attn( - q, k, v, o, q_scale, k_scale, v_mean, - static_cast(tensor_layout), static_cast(is_causal), static_cast(qk_quant_gran), - static_cast(sm_scale), static_cast(return_lse)); -} - -// Fused -static void quant_per_block_int8_cuda_wrap( - at::Tensor input, at::Tensor output, at::Tensor scale, - double sm_scale, int64_t block_size, int64_t tensor_layout) { - quant_per_block_int8_cuda( - input, output, scale, - static_cast(sm_scale), static_cast(block_size), static_cast(tensor_layout)); -} - -static void quant_per_block_int8_fuse_sub_mean_cuda_wrap( - at::Tensor input, at::Tensor mean, at::Tensor output, at::Tensor scale, - int64_t block_size, int64_t 
tensor_layout) { - quant_per_block_int8_fuse_sub_mean_cuda( - input, mean, output, scale, - static_cast(block_size), static_cast(tensor_layout)); -} - -static void quant_per_warp_int8_cuda_wrap( - at::Tensor input, at::Tensor output, at::Tensor scale, - int64_t block_size, int64_t warp_block_size, int64_t tensor_layout) { - quant_per_warp_int8_cuda( - input, output, scale, - static_cast(block_size), static_cast(warp_block_size), static_cast(tensor_layout)); -} - -static void sub_mean_cuda_wrap( - at::Tensor input, at::Tensor mean, at::Tensor output, - int64_t tensor_layout) { - sub_mean_cuda(input, mean, output, static_cast(tensor_layout)); -} - -static void transpose_pad_permute_cuda_wrap( - at::Tensor input, at::Tensor output, int64_t tensor_layout) { - transpose_pad_permute_cuda(input, output, static_cast(tensor_layout)); -} - -static void scale_fuse_quant_cuda_wrap( - at::Tensor input, at::Tensor output, at::Tensor scale, - int64_t num_tokens, double scale_max, int64_t tensor_layout) { - scale_fuse_quant_cuda( - input, output, scale, - static_cast(num_tokens), static_cast(scale_max), static_cast(tensor_layout)); -} - -static void mean_scale_fuse_quant_cuda_wrap( - at::Tensor input, at::Tensor output, at::Tensor mean, at::Tensor scale, - int64_t num_tokens, double scale_max, int64_t tensor_layout) { - mean_scale_fuse_quant_cuda( - input, output, mean, scale, - static_cast(num_tokens), static_cast(scale_max), static_cast(tensor_layout)); -} TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // SM90 diff --git a/torch-ext/torch_binding.h b/torch-ext/torch_binding.h index 4be1bb053aecd753a96f93f3f3f2cf95c9095c84..46bed40d0868d47b6fb896149e760b61e9ba2ec9 100644 --- a/torch-ext/torch_binding.h +++ b/torch-ext/torch_binding.h @@ -1,6 +1,7 @@ #pragma once #include +#include // SM80 torch::Tensor qk_int8_sv_f16_accum_f32_attn(torch::Tensor query, @@ -218,4 +219,261 @@ void mean_scale_fuse_quant_cuda( torch::Tensor scale, int num_tokens, float scale_max, - int tensor_layout); \ No newline at end of file + int tensor_layout); + + + +void sm_check_89(torch::Tensor x, std::string op_name) { + int device_index = x.get_device(); + const auto& prop = at::cuda::getDeviceProperties(device_index); + + std::cerr << "sm_check_89: prop->major: " << prop->major << std::endl; + std::cerr << "sm_check_89: prop->minor: " << prop->minor << std::endl; + + if (prop->major < 8 || (prop->major == 8 && prop->minor < 9)) { + TORCH_CHECK(false, op_name + " requires compute capability 8.9+"); + } +} + +void sm_check_90(torch::Tensor x, std::string op_name) { + int device_index = x.get_device(); + const auto& prop = at::cuda::getDeviceProperties(device_index); + + std::cerr << "sm_check_90: prop->major: " << prop->major << std::endl; + std::cerr << "sm_check_90: prop->minor: " << prop->minor << std::endl; + + if (prop->major < 9) { + TORCH_CHECK(false, op_name + " requires compute capability 9.0+"); + } +} + +void sm_check_80(torch::Tensor x, std::string op_name) { + int device_index = x.get_device(); + const auto& prop = at::cuda::getDeviceProperties(device_index); + std::cerr << "sm_check_80: prop->major: " << prop->major << std::endl; + std::cerr << "sm_check_80: prop->minor: " << prop->minor << std::endl; + if (prop->major < 8) { + TORCH_CHECK(false, op_name + " requires compute capability 8.0+"); + } +} + +// ############################################################################## +// SM89 +// ############################################################################## +static at::Tensor 
qk_int8_sv_f8_accum_f32_attn_wrap( + at::Tensor q, at::Tensor k, at::Tensor v, at::Tensor o, + at::Tensor q_scale, at::Tensor k_scale, + int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, + double sm_scale, int64_t return_lse) { + sm_check_89(q, "qk_int8_sv_f8_accum_f32_attn"); + return qk_int8_sv_f8_accum_f32_attn( + q, k, v, o, q_scale, k_scale, + static_cast(tensor_layout), static_cast(is_causal), static_cast(qk_quant_gran), + static_cast(sm_scale), static_cast(return_lse)); +} + +static at::Tensor qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_wrap( + at::Tensor q, at::Tensor k, at::Tensor v, at::Tensor o, + at::Tensor q_scale, at::Tensor k_scale, at::Tensor v_scale, + int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, + double sm_scale, int64_t return_lse) { + sm_check_89(q, "qk_int8_sv_f8_accum_f32_fuse_v_scale_attn"); + return qk_int8_sv_f8_accum_f32_fuse_v_scale_attn( + q, k, v, o, q_scale, k_scale, v_scale, + static_cast(tensor_layout), static_cast(is_causal), static_cast(qk_quant_gran), + static_cast(sm_scale), static_cast(return_lse)); +} + +static at::Tensor qk_int8_sv_f8_accum_f32_fuse_v_scale_fuse_v_mean_attn_wrap( + at::Tensor q, at::Tensor k, at::Tensor v, at::Tensor o, + at::Tensor q_scale, at::Tensor k_scale, at::Tensor v_scale, at::Tensor v_mean, + int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, + double sm_scale, int64_t return_lse) { + sm_check_89(q, "qk_int8_sv_f8_accum_f32_fuse_v_scale_fuse_v_mean_attn"); + return qk_int8_sv_f8_accum_f32_fuse_v_scale_fuse_v_mean_attn( + q, k, v, o, q_scale, k_scale, v_scale, v_mean, + static_cast(tensor_layout), static_cast(is_causal), static_cast(qk_quant_gran), + static_cast(sm_scale), static_cast(return_lse)); +} + +static at::Tensor qk_int8_sv_f8_accum_f32_attn_inst_buf_wrap( + at::Tensor q, at::Tensor k, at::Tensor v, at::Tensor o, + at::Tensor q_scale, at::Tensor k_scale, + int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, + double sm_scale, int64_t return_lse) { + sm_check_89(q, "qk_int8_sv_f8_accum_f32_attn_inst_buf"); + return qk_int8_sv_f8_accum_f32_attn_inst_buf( + q, k, v, o, q_scale, k_scale, + static_cast(tensor_layout), static_cast(is_causal), static_cast(qk_quant_gran), + static_cast(sm_scale), static_cast(return_lse)); +} + +static at::Tensor qk_int8_sv_f8_accum_f16_attn_inst_buf_wrap( + at::Tensor q, at::Tensor k, at::Tensor v, at::Tensor o, + at::Tensor q_scale, at::Tensor k_scale, + int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, + double sm_scale, int64_t return_lse) { + sm_check_89(q, "qk_int8_sv_f8_accum_f16_attn_inst_buf"); + return qk_int8_sv_f8_accum_f16_attn_inst_buf( + q, k, v, o, q_scale, k_scale, + static_cast(tensor_layout), static_cast(is_causal), static_cast(qk_quant_gran), + static_cast(sm_scale), static_cast(return_lse)); +} + +static at::Tensor qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf_wrap( + at::Tensor q, at::Tensor k, at::Tensor v, at::Tensor o, + at::Tensor q_scale, at::Tensor k_scale, at::Tensor v_scale, + int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, + double sm_scale, int64_t return_lse) { + sm_check_89(q, "qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf"); + return qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf( + q, k, v, o, q_scale, k_scale, v_scale, + static_cast(tensor_layout), static_cast(is_causal), static_cast(qk_quant_gran), + static_cast(sm_scale), static_cast(return_lse)); +} + +static at::Tensor qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf_wrap( + at::Tensor q, 
at::Tensor k, at::Tensor v, at::Tensor o, + at::Tensor q_scale, at::Tensor k_scale, at::Tensor v_scale, + int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, + double sm_scale, int64_t return_lse) { + sm_check_89(q, "qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf"); + return qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf( + q, k, v, o, q_scale, k_scale, v_scale, + static_cast(tensor_layout), static_cast(is_causal), static_cast(qk_quant_gran), + static_cast(sm_scale), static_cast(return_lse)); +} + + +// ############################################################################## +// SM90 +// ############################################################################## + +static at::Tensor qk_int8_sv_f8_accum_f32_attn_inst_buf_sm90_wrap( + at::Tensor q, at::Tensor k, at::Tensor v, at::Tensor o, + at::Tensor q_scale, at::Tensor k_scale, + int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, + double sm_scale, int64_t return_lse) { + sm_check_90(q, "qk_int8_sv_f8_accum_f32_attn_inst_buf_sm90"); +return qk_int8_sv_f8_accum_f32_attn_inst_buf_sm90( + q, k, v, o, q_scale, k_scale, + static_cast(tensor_layout), static_cast(is_causal), static_cast(qk_quant_gran), + static_cast(sm_scale), static_cast(return_lse)); +} + +static at::Tensor qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf_sm90_wrap( + at::Tensor q, at::Tensor k, at::Tensor v, at::Tensor o, + at::Tensor q_scale, at::Tensor k_scale, at::Tensor v_scale, + int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, + double sm_scale, int64_t return_lse) { + sm_check_90(q, "qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf_sm90"); +return qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf_sm90( + q, k, v, o, q_scale, k_scale, v_scale, + static_cast(tensor_layout), static_cast(is_causal), static_cast(qk_quant_gran), + static_cast(sm_scale), static_cast(return_lse)); +} + +// ############################################################################## +// SM80 +// ############################################################################## +static at::Tensor qk_int8_sv_f16_accum_f32_attn_wrap( + at::Tensor q, at::Tensor k, at::Tensor v, at::Tensor o, + at::Tensor q_scale, at::Tensor k_scale, + int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, + double sm_scale, int64_t return_lse) { + sm_check_80(q, "qk_int8_sv_f16_accum_f32_attn"); + return qk_int8_sv_f16_accum_f32_attn( + q, k, v, o, q_scale, k_scale, + static_cast(tensor_layout), static_cast(is_causal), static_cast(qk_quant_gran), + static_cast(sm_scale), static_cast(return_lse)); +} + +static at::Tensor qk_int8_sv_f16_accum_f16_attn_wrap( + at::Tensor q, at::Tensor k, at::Tensor v, at::Tensor o, + at::Tensor q_scale, at::Tensor k_scale, + int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, + double sm_scale, int64_t return_lse) { + sm_check_80(q, "qk_int8_sv_f16_accum_f16_attn"); + return qk_int8_sv_f16_accum_f16_attn( + q, k, v, o, q_scale, k_scale, + static_cast(tensor_layout), static_cast(is_causal), static_cast(qk_quant_gran), + static_cast(sm_scale), static_cast(return_lse)); +} + +static at::Tensor qk_int8_sv_f16_accum_f16_attn_inst_buf_wrap( + at::Tensor q, at::Tensor k, at::Tensor v, at::Tensor o, + at::Tensor q_scale, at::Tensor k_scale, + int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, + double sm_scale, int64_t return_lse) { + sm_check_80(q, "qk_int8_sv_f16_accum_f16_attn_inst_buf"); + return qk_int8_sv_f16_accum_f16_attn_inst_buf( + q, k, v, o, q_scale, k_scale, + 
static_cast(tensor_layout), static_cast(is_causal), static_cast(qk_quant_gran), + static_cast(sm_scale), static_cast(return_lse)); +} + +static at::Tensor qk_int8_sv_f16_accum_f16_fuse_v_mean_attn_wrap( + at::Tensor q, at::Tensor k, at::Tensor v, at::Tensor o, + at::Tensor q_scale, at::Tensor k_scale, at::Tensor v_mean, + int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran, + double sm_scale, int64_t return_lse) { + sm_check_80(q, "qk_int8_sv_f16_accum_f16_fuse_v_mean_attn"); + return qk_int8_sv_f16_accum_f16_fuse_v_mean_attn( + q, k, v, o, q_scale, k_scale, v_mean, + static_cast(tensor_layout), static_cast(is_causal), static_cast(qk_quant_gran), + static_cast(sm_scale), static_cast(return_lse)); +} + +// ############################################################################## +// Fused +// ############################################################################## +static void quant_per_block_int8_cuda_wrap( + at::Tensor input, at::Tensor output, at::Tensor scale, + double sm_scale, int64_t block_size, int64_t tensor_layout) { + quant_per_block_int8_cuda( + input, output, scale, + static_cast(sm_scale), static_cast(block_size), static_cast(tensor_layout)); +} + +static void quant_per_block_int8_fuse_sub_mean_cuda_wrap( + at::Tensor input, at::Tensor mean, at::Tensor output, at::Tensor scale, + int64_t block_size, int64_t tensor_layout) { + quant_per_block_int8_fuse_sub_mean_cuda( + input, mean, output, scale, + static_cast(block_size), static_cast(tensor_layout)); +} + +static void quant_per_warp_int8_cuda_wrap( + at::Tensor input, at::Tensor output, at::Tensor scale, + int64_t block_size, int64_t warp_block_size, int64_t tensor_layout) { + quant_per_warp_int8_cuda( + input, output, scale, + static_cast(block_size), static_cast(warp_block_size), static_cast(tensor_layout)); +} + +static void sub_mean_cuda_wrap( + at::Tensor input, at::Tensor mean, at::Tensor output, + int64_t tensor_layout) { + sub_mean_cuda(input, mean, output, static_cast(tensor_layout)); +} + +static void transpose_pad_permute_cuda_wrap( + at::Tensor input, at::Tensor output, int64_t tensor_layout) { + transpose_pad_permute_cuda(input, output, static_cast(tensor_layout)); +} + +static void scale_fuse_quant_cuda_wrap( + at::Tensor input, at::Tensor output, at::Tensor scale, + int64_t num_tokens, double scale_max, int64_t tensor_layout) { + scale_fuse_quant_cuda( + input, output, scale, + static_cast(num_tokens), static_cast(scale_max), static_cast(tensor_layout)); +} + +static void mean_scale_fuse_quant_cuda_wrap( + at::Tensor input, at::Tensor output, at::Tensor mean, at::Tensor scale, + int64_t num_tokens, double scale_max, int64_t tensor_layout) { + mean_scale_fuse_quant_cuda( + input, output, mean, scale, + static_cast(num_tokens), static_cast(scale_max), static_cast(tensor_layout)); +} \ No newline at end of file
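
Two points in the cuda_tensormap_shim.cuh hunk above are worth making concrete. First, the "Layout-compatible stand-in" only works because CUtensorMap is an opaque, 64-byte-aligned blob of CU_TENSOR_MAP_NUM_QWORDS (16) qwords that the driver reads through a pointer, so the stand-in's size, alignment, and enum values all have to match cuda.h exactly. A compile-time check along the following lines pins that down; it is an illustrative sketch, not code from this diff.

// Illustrative check, not part of this change: whichever definition is in
// effect (the real CUtensorMap from cuda.h or the shim's stand-in), it must
// be 128 bytes (16 qwords) with 64-byte alignment, because the driver reads
// the tensor map through this pointer type.
#include <cuda.h>                                   // real tensor-map types on newer toolkits
#include "sage_attention/cuda_tensormap_shim.cuh"   // stand-ins on older ones

static_assert(sizeof(CUtensorMap) == 16 * sizeof(unsigned long long),
              "CUtensorMap must be exactly 128 bytes");
static_assert(alignof(CUtensorMap) == 64,
              "CUtensorMap must be 64-byte aligned");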
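
Second, the shim ends with "no declaration of cuTensorMapEncodeTiled here; it's resolved at runtime", but the resolution code itself is not part of this diff. One common way to do that on Linux is dlopen/dlsym against libcuda.so.1, sketched below; the helper name resolve_cuTensorMapEncodeTiled and the error handling are assumptions, and the function-pointer signature follows the CUDA driver API documentation for cuTensorMapEncodeTiled.

// Illustrative sketch, not repository code: resolve cuTensorMapEncodeTiled
// from the driver at runtime, as the shim's closing comment describes.
#include <cuda.h>                                   // CUresult, cuuint32_t, cuuint64_t
#include "sage_attention/cuda_tensormap_shim.cuh"   // tensor-map types if cuda.h lacks them
#include <dlfcn.h>
#include <cstdio>

// Signature as documented for the CUDA driver API entry point.
using cuTensorMapEncodeTiled_t = CUresult (*)(
    CUtensorMap* tensorMap,
    CUtensorMapDataType dataType,
    cuuint32_t rank,
    void* globalAddress,
    const cuuint64_t* globalDim,      // rank entries
    const cuuint64_t* globalStrides,  // rank - 1 entries, in bytes
    const cuuint32_t* boxDim,         // rank entries
    const cuuint32_t* elementStrides, // rank entries
    CUtensorMapInterleave interleave,
    CUtensorMapSwizzle swizzle,
    CUtensorMapL2promotion l2Promotion,
    CUtensorMapFloatOOBfill oobFill);

// Hypothetical helper: libcuda is normally already loaded by the CUDA runtime,
// so dlopen only bumps its reference count and returns the existing handle.
inline cuTensorMapEncodeTiled_t resolve_cuTensorMapEncodeTiled() {
  static cuTensorMapEncodeTiled_t fn = [] {
    void* sym = nullptr;
    if (void* libcuda = dlopen("libcuda.so.1", RTLD_NOW | RTLD_GLOBAL)) {
      sym = dlsym(libcuda, "cuTensorMapEncodeTiled");
    }
    if (sym == nullptr) {
      std::fprintf(stderr, "cuTensorMapEncodeTiled is not exported by this driver\n");
    }
    return reinterpret_cast<cuTensorMapEncodeTiled_t>(sym);
  }();
  return fn;
}

Resolving the symbol this way keeps the SM90 path buildable against toolchains whose headers predate the tensor-map API while still using the driver's implementation when it is present; callers only need to check the returned pointer for nullptr before encoding a tensor map.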