medmekk (HF Staff) committed
Commit af2d0c0 · 1 parent: 44b112f

add some builds

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. CMakeLists.txt +1 -0
  2. build.toml +10 -6
  3. build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__init__.py +12 -0
  4. build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc +0 -0
  5. build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc +0 -0
  6. build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc +0 -0
  7. build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc +0 -0
  8. build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc +0 -0
  9. {torch-ext → build/torch27-cxx11-cu126-x86_64-linux}/sage_attention/_ops.py +3 -3
  10. torch-ext/sage_attention/_sage_attention_57cb7ec_dirty.abi3.so → build/torch27-cxx11-cu126-x86_64-linux/sage_attention/_sage_attention_44b112f_dirty.abi3.so +2 -2
  11. build/torch27-cxx11-cu126-x86_64-linux/sage_attention/core.py +983 -0
  12. build/torch27-cxx11-cu126-x86_64-linux/sage_attention/layers.py +0 -0
  13. build/torch27-cxx11-cu126-x86_64-linux/sage_attention/quant.py +326 -0
  14. build/torch27-cxx11-cu126-x86_64-linux/sage_attention/quant_per_thread.py +204 -0
  15. build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__init__.py +12 -0
  16. build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc +0 -0
  17. build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc +0 -0
  18. build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc +0 -0
  19. build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc +0 -0
  20. build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc +0 -0
  21. build/torch27-cxx11-cu128-x86_64-linux/sage_attention/_ops.py +9 -0
  22. build/torch27-cxx11-cu128-x86_64-linux/sage_attention/_sage_attention_44b112f_dirty.abi3.so +3 -0
  23. build/torch27-cxx11-cu128-x86_64-linux/sage_attention/core.py +983 -0
  24. build/torch27-cxx11-cu128-x86_64-linux/sage_attention/layers.py +0 -0
  25. build/torch27-cxx11-cu128-x86_64-linux/sage_attention/quant.py +326 -0
  26. build/torch27-cxx11-cu128-x86_64-linux/sage_attention/quant_per_thread.py +204 -0
  27. build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__init__.py +12 -0
  28. build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc +0 -0
  29. build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc +0 -0
  30. build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc +0 -0
  31. build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc +0 -0
  32. build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc +0 -0
  33. build/torch28-cxx11-cu126-x86_64-linux/sage_attention/_ops.py +9 -0
  34. build/torch28-cxx11-cu126-x86_64-linux/sage_attention/_sage_attention_44b112f_dirty.abi3.so +3 -0
  35. build/torch28-cxx11-cu126-x86_64-linux/sage_attention/core.py +983 -0
  36. build/torch28-cxx11-cu126-x86_64-linux/sage_attention/layers.py +0 -0
  37. build/torch28-cxx11-cu126-x86_64-linux/sage_attention/quant.py +326 -0
  38. build/torch28-cxx11-cu126-x86_64-linux/sage_attention/quant_per_thread.py +204 -0
  39. build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__init__.py +12 -0
  40. build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc +0 -0
  41. build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc +0 -0
  42. build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc +0 -0
  43. build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc +0 -0
  44. build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc +0 -0
  45. build/torch28-cxx11-cu128-x86_64-linux/sage_attention/_ops.py +9 -0
  46. build/torch28-cxx11-cu128-x86_64-linux/sage_attention/_sage_attention_44b112f_dirty.abi3.so +3 -0
  47. build/torch28-cxx11-cu128-x86_64-linux/sage_attention/core.py +983 -0
  48. build/torch28-cxx11-cu128-x86_64-linux/sage_attention/layers.py +0 -0
  49. build/torch28-cxx11-cu128-x86_64-linux/sage_attention/quant.py +326 -0
  50. build/torch28-cxx11-cu128-x86_64-linux/sage_attention/quant_per_thread.py +204 -0
CMakeLists.txt CHANGED
@@ -142,6 +142,7 @@ set(_qattn_sm90_SRC
   "sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu"
   "sage_attention/qattn/attn_cuda_sm90.h"
   "sage_attention/qattn/attn_utils.cuh"
+  "sage_attention/cuda_tensormap_shim.cuh"
 )
 
 # TODO: check if CLion support this:
build.toml CHANGED
@@ -1,21 +1,20 @@
 [general]
 name = "sage_attention"
 universal = false
+cuda-minver = "12.4"
 
 [torch]
 src = [
   "torch-ext/torch_binding.cpp",
   "torch-ext/torch_binding.h",
 ]
-cuda-capabilities = [
-  "8.0", "9.0"
-]
 
 [kernel._qattn]
 depends = ["torch"]
 backend = "cuda"
+cuda-minver = "12.4"
 cuda-capabilities = [
-  "9.0"
+  "8.0", "8.9", "9.0a"
 ]
 src = [
   "sage_attention/cp_async.cuh",
@@ -27,6 +26,7 @@ src = [
   "sage_attention/reduction_utils.cuh",
   "sage_attention/wgmma.cuh",
   "sage_attention/utils.cuh",
+  "sage_attention/cuda_tensormap_shim.cuh",
 ]
 cxx-flags = ["-g", "-O3", "-fopenmp", "-lgomp", "-std=c++17", "-DENABLE_BF16"]
 cuda-flags = [
@@ -43,6 +43,7 @@ cuda-flags = [
 [kernel._qattn_sm80]
 depends = ["torch"]
 backend = "cuda"
+cuda-minver = "12.4"
 cuda-capabilities = [
   "8.0"
 ]
@@ -68,6 +69,7 @@ cuda-flags = [
 [kernel._qattn_sm89]
 depends = ["torch"]
 backend = "cuda"
+cuda-minver = "12.4"
 cuda-capabilities = [
   "8.9",
 ]
@@ -100,8 +102,9 @@ cuda-flags = [
 [kernel._qattn_sm90]
 depends = ["torch"]
 backend = "cuda"
+cuda-minver = "12.4"
 cuda-capabilities = [
-  "9.0",
+  "9.0a",
 ]
 include = ["."]
 src = [
@@ -124,8 +127,9 @@ cuda-flags = [
 [kernel._fused]
 depends = ["torch"]
 backend = "cuda"
+cuda-minver = "12.4"
 cuda-capabilities = [
-  "9.0",
+  "8.0", "8.9", "9.0a",
 ]
 include = ["."]
 src = [
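
Note (not part of the commit): the capability strings above name the GPU architectures each kernel variant is compiled for, and "9.0a" selects Hopper's architecture-specific target (sm_90a), which the wgmma/TMA code paths require. As a rough, hedged illustration of what these entries usually expand to for nvcc (the actual kernel-builder behavior may differ in detail):

def gencode_flags(capabilities):
    # "8.0" -> "80", "9.0a" -> "90a"; one -gencode flag per requested architecture
    flags = []
    for cap in capabilities:
        tag = cap.replace(".", "")
        flags.append(f"-gencode=arch=compute_{tag},code=sm_{tag}")
    return flags

print(gencode_flags(["8.0", "8.9", "9.0a"]))
# ['-gencode=arch=compute_80,code=sm_80',
#  '-gencode=arch=compute_89,code=sm_89',
#  '-gencode=arch=compute_90a,code=sm_90a']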
build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__init__.py ADDED
@@ -0,0 +1,12 @@
1
+ from .quant import per_block_int8, per_warp_int8, sub_mean, per_channel_fp8
2
+ from .core import sageattn, sageattn_qk_int8_pv_fp8_cuda
3
+
4
+
5
+ __all__ = [
6
+ "per_block_int8",
7
+ "per_warp_int8",
8
+ "sub_mean",
9
+ "per_channel_fp8",
10
+ "sageattn",
11
+ "sageattn_qk_int8_pv_fp8_cuda",
12
+ ]
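
Note (not part of the commit): a minimal usage sketch of the API exported above, assuming a CUDA device with one of the supported architectures; shapes and the "HND" layout follow the docstrings in core.py below.

import torch
from sage_attention import sageattn

# Q/K/V in "HND" layout: [batch, num_heads, seq_len, head_dim]
q = torch.randn(1, 8, 1024, 128, dtype=torch.float16, device="cuda")
k = torch.randn(1, 8, 1024, 128, dtype=torch.float16, device="cuda")
v = torch.randn(1, 8, 1024, 128, dtype=torch.float16, device="cuda")

out = sageattn(q, k, v, tensor_layout="HND", is_causal=True)
print(out.shape)  # torch.Size([1, 8, 1024, 128])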
build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (433 Bytes)
build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc ADDED
Binary file (550 Bytes)
build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc ADDED
Binary file (33.4 kB)
build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc ADDED
Binary file (13.4 kB)
build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc ADDED
Binary file (13 kB)
{torch-ext → build/torch27-cxx11-cu126-x86_64-linux}/sage_attention/_ops.py RENAMED
@@ -1,9 +1,9 @@
 import torch
-from . import _sage_attention_57cb7ec_dirty
-ops = torch.ops._sage_attention_57cb7ec_dirty
+from . import _sage_attention_44b112f_dirty
+ops = torch.ops._sage_attention_44b112f_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_sage_attention_57cb7ec_dirty::{op_name}"
+    return f"_sage_attention_44b112f_dirty::{op_name}"
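
Note (not part of the commit): a hedged sketch of how this module is consumed. The bound kernels are reached through the ops table (for example ops.quant_per_warp_int8_cuda, used in quant.py below), and the prefix helper is handy whenever an op must be referred to by its fully qualified name, e.g. when registering meta/fake implementations.

from sage_attention._ops import ops, add_op_namespace_prefix

print(add_op_namespace_prefix("quant_per_warp_int8_cuda"))
# "_sage_attention_44b112f_dirty::quant_per_warp_int8_cuda"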
torch-ext/sage_attention/_sage_attention_57cb7ec_dirty.abi3.so → build/torch27-cxx11-cu126-x86_64-linux/sage_attention/_sage_attention_44b112f_dirty.abi3.so RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:afa4831d0d218167c818a3871cf9fc01f154a6fc3c4671efdfede77a83e3b083
-size 26036368
+oid sha256:b577da1986b76b2571e8dd55412621e6fc85fe1a2f847bc0a5af9851bf388cf2
+size 26037568
build/torch27-cxx11-cu126-x86_64-linux/sage_attention/core.py ADDED
@@ -0,0 +1,983 @@
1
+ """
2
+ Copyright (c) 2024 by SageAttention team.
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ import torch
18
+ import torch.nn.functional as F
19
+
20
+ from ._ops import ops
21
+
22
+
23
+ from .quant import per_warp_int8 as per_warp_int8_cuda
24
+ from .quant import sub_mean
25
+ from .quant import per_channel_fp8
26
+ from .quant_per_thread import per_thread_int8 as per_thread_int8_triton
27
+
28
+ from typing import Any, List, Literal, Optional, Tuple, Union
29
+ import warnings
30
+
31
+
32
+ import subprocess
33
+ import re
34
+
35
+
36
+ def get_cuda_version():
37
+ try:
38
+ output = subprocess.check_output(["nvcc", "--version"]).decode()
39
+ match = re.search(r"release (\d+)\.(\d+)", output)
40
+ if match:
41
+ major, minor = int(match.group(1)), int(match.group(2))
42
+ return major, minor
43
+ except Exception as e:
44
+ print("Failed to get CUDA version:", e)
45
+ return None, None
46
+
47
+
48
+ def get_cuda_arch_versions():
49
+ cuda_archs = []
50
+ for i in range(torch.cuda.device_count()):
51
+ major, minor = torch.cuda.get_device_capability(i)
52
+ cuda_archs.append(f"sm{major}{minor}")
53
+ return cuda_archs
54
+
55
+
56
+ def sageattn(
57
+ q: torch.Tensor,
58
+ k: torch.Tensor,
59
+ v: torch.Tensor,
60
+ tensor_layout: str = "HND",
61
+ is_causal: bool = False,
62
+ sm_scale: Optional[float] = None,
63
+ return_lse: bool = False,
64
+ **kwargs: Any,
65
+ ):
66
+ """
67
+ Automatically selects the appropriate implementation of the SageAttention kernel based on the GPU compute capability.
68
+
69
+ Parameters
70
+ ----------
71
+ q : torch.Tensor
72
+ The query tensor. Shape:
73
+ - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``.
74
+ - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``.
75
+
76
+ k : torch.Tensor
77
+ The key tensor. Shape:
78
+ - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
79
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
80
+
81
+ v : torch.Tensor
82
+ The value tensor. Shape:
83
+ - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
84
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
85
+
86
+ tensor_layout : str
87
+ The tensor layout, either "HND" or "NHD".
88
+ Default: "HND".
89
+
90
+ is_causal : bool
91
+ Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len.
92
+ Default: False.
93
+
94
+ sm_scale : Optional[float]
95
+ The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``.
96
+
97
+ return_lse : bool
98
+ Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention.
99
+ Default: False.
100
+
101
+ Returns
102
+ -------
103
+ torch.Tensor
104
+ The output tensor. Shape:
105
+ - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``.
106
+ - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``.
107
+
108
+ torch.Tensor
109
+ The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor).
110
+ Shape: ``[batch_size, num_qo_heads, qo_len]``.
111
+ Only returned if `return_lse` is True.
112
+
113
+ Note
114
+ ----
115
+ - ``num_qo_heads`` must be divisible by ``num_kv_heads``.
116
+ - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16``
117
+ - All tensors must be on the same cuda device.
118
+ """
119
+
120
+ arch = get_cuda_arch_versions()[q.device.index]
121
+ if arch == "sm80":
122
+ return sageattn_qk_int8_pv_fp16_cuda(
123
+ q,
124
+ k,
125
+ v,
126
+ tensor_layout=tensor_layout,
127
+ is_causal=is_causal,
128
+ sm_scale=sm_scale,
129
+ return_lse=return_lse,
130
+ pv_accum_dtype="fp32",
131
+ )
132
+ elif arch == "sm89":
133
+ return sageattn_qk_int8_pv_fp8_cuda(
134
+ q,
135
+ k,
136
+ v,
137
+ tensor_layout=tensor_layout,
138
+ is_causal=is_causal,
139
+ sm_scale=sm_scale,
140
+ return_lse=return_lse,
141
+ pv_accum_dtype="fp32+fp16",
142
+ )
143
+ elif arch == "sm90":
144
+ return sageattn_qk_int8_pv_fp8_cuda_sm90(
145
+ q,
146
+ k,
147
+ v,
148
+ tensor_layout=tensor_layout,
149
+ is_causal=is_causal,
150
+ sm_scale=sm_scale,
151
+ return_lse=return_lse,
152
+ pv_accum_dtype="fp32+fp32",
153
+ )
154
+ elif arch == "sm120":
155
+ return sageattn_qk_int8_pv_fp8_cuda(
156
+ q,
157
+ k,
158
+ v,
159
+ tensor_layout=tensor_layout,
160
+ is_causal=is_causal,
161
+ qk_quant_gran="per_warp",
162
+ sm_scale=sm_scale,
163
+ return_lse=return_lse,
164
+ pv_accum_dtype="fp32+fp16",
165
+ ) # sm120 has accurate fp32 accumulator for fp8 mma and triton kernel is currently not usable on sm120.
166
+ else:
167
+ raise ValueError(f"Unsupported CUDA architecture: {arch}")
168
+
169
+
170
+ @torch.compiler.disable
171
+ def sageattn_qk_int8_pv_fp16_cuda(
172
+ q: torch.Tensor,
173
+ k: torch.Tensor,
174
+ v: torch.Tensor,
175
+ tensor_layout: str = "HND",
176
+ is_causal: bool = False,
177
+ qk_quant_gran: str = "per_thread",
178
+ sm_scale: Optional[float] = None,
179
+ pv_accum_dtype: str = "fp32",
180
+ smooth_k: bool = True,
181
+ smooth_v: bool = False,
182
+ return_lse: bool = False,
183
+ **kwargs: Any,
184
+ ) -> torch.Tensor:
185
+ """
186
+ SageAttention with INT8 quantization for Q and K, FP16 PV with FP16/FP32 accumulation, implemented using CUDA.
187
+
188
+ Parameters
189
+ ----------
190
+ q : torch.Tensor
191
+ The query tensor. Shape:
192
+ - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``.
193
+ - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``.
194
+
195
+ k : torch.Tensor
196
+ The key tensor. Shape:
197
+ - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
198
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
199
+
200
+ v : torch.Tensor
201
+ The value tensor. Shape:
202
+ - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
203
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
204
+
205
+ tensor_layout : str
206
+ The tensor layout, either "HND" or "NHD".
207
+ Default: "HND".
208
+
209
+ is_causal : bool
210
+ Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len.
211
+ Default: False.
212
+
213
+ qk_quant_gran : str
214
+ The granularity of quantization for Q and K, either "per_warp" or "per_thread".
215
+ Default: "per_thread".
216
+
217
+ sm_scale : Optional[float]
218
+ The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``.
219
+
220
+ pv_accum_dtype : str
221
+ The dtype of the accumulation of the product of the value tensor and the attention weights, either "fp16", "fp16+fp32" or "fp32".
222
+ - "fp16": PV accumulation is done in fully in FP16. This is the fastest option but may lead to numerical instability. `smooth_v` option will increase the accuracy in cases when the value tensor has a large bias (like in CogVideoX-2b).
223
+ - "fp32": PV accumulation is done in FP32. This is the most accurate option but may be slower than "fp16" due to CUDA core overhead.
224
+ - "fp16+fp32": PV accumulation is done in FP16, but added to a FP32 buffer every few iterations. This offers a balance between speed and accuracy.
225
+ Default: "fp32".
226
+
227
+ smooth_k : bool
228
+ Whether to smooth the key tensor by subtracting the mean along the sequence dimension.
229
+ Default: True.
230
+
231
+ smooth_v : bool
232
+ Whether to smooth the value tensor by subtracting the mean along the sequence dimension.
233
+ smooth_v will be ignored if pv_accum_dtype is "fp32" or "fp16+fp32".
234
+ Default: False.
235
+
236
+ return_lse : bool
237
+ Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention.
238
+ Default: False.
239
+
240
+ Returns
241
+ -------
242
+ torch.Tensor
243
+ The output tensor. Shape:
244
+ - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``.
245
+ - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``.
246
+
247
+ torch.Tensor
248
+ The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor).
249
+ Shape: ``[batch_size, num_qo_heads, qo_len]``.
250
+ Only returned if `return_lse` is True.
251
+
252
+ Note
253
+ ----
254
+ - ``num_qo_heads`` must be divisible by ``num_kv_heads``.
255
+ - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16``
256
+ - All tensors must be on the same cuda device.
257
+ - `smooth_k` will introduce slight overhead but will improve the accuracy under most circumstances.
258
+ """
259
+
260
+ dtype = q.dtype
261
+ assert q.is_cuda, "Input tensors must be on cuda."
262
+ assert dtype in [torch.float16, torch.bfloat16], (
263
+ "Input tensors must be in dtype of torch.float16 or torch.bfloat16"
264
+ )
265
+ assert qk_quant_gran in ["per_warp", "per_thread"], (
266
+ "qk_quant_gran must be either 'per_warp' or 'per_thread'."
267
+ )
268
+ assert q.device == k.device == v.device, "All tensors must be on the same device."
269
+ assert q.dtype == k.dtype == v.dtype, "All tensors must have the same dtype."
270
+
271
+ # FIXME(DefTruth): make sage attention work compatible with distributed
272
+ # env, for example, xDiT which launch by torchrun. Without this workaround,
273
+ # sage attention will run into illegal memory access error after first
274
+ # inference step in distributed env for multi gpus inference. This small
275
+ # workaround also make sage attention work compatible with torch.compile
276
+ # through non-fullgraph compile mode.
277
+ torch.cuda.set_device(v.device)
278
+
279
+ _tensor_layout = 0 if tensor_layout == "NHD" else 1
280
+ _is_caual = 1 if is_causal else 0
281
+ _qk_quant_gran = 3 if qk_quant_gran == "per_thread" else 2
282
+ _return_lse = 1 if return_lse else 0
283
+
284
+ head_dim_og = q.size(-1)
285
+
286
+ if head_dim_og < 64:
287
+ q = torch.nn.functional.pad(q, (0, 64 - head_dim_og))
288
+ k = torch.nn.functional.pad(k, (0, 64 - head_dim_og))
289
+ v = torch.nn.functional.pad(v, (0, 64 - head_dim_og))
290
+ elif head_dim_og > 64 and head_dim_og < 128:
291
+ q = torch.nn.functional.pad(q, (0, 128 - head_dim_og))
292
+ k = torch.nn.functional.pad(k, (0, 128 - head_dim_og))
293
+ v = torch.nn.functional.pad(v, (0, 128 - head_dim_og))
294
+ elif head_dim_og > 128:
295
+ raise ValueError(f"Unsupported head_dim: {head_dim_og}")
296
+
297
+ # assert last dim is contiguous
298
+ assert q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1, (
299
+ "Last dim of qkv must be contiguous."
300
+ )
301
+
302
+ if sm_scale is None:
303
+ sm_scale = head_dim_og**-0.5
304
+
305
+ seq_dim = 1 if _tensor_layout == 0 else 2
306
+ nh_dim = 2 if _tensor_layout == 0 else 1
307
+
308
+ if smooth_k:
309
+ km = k.mean(dim=seq_dim, keepdim=True)
310
+ nqheads = q.size(2)
311
+ nkheads = k.size(2)
312
+ q_per_kv_heads = nqheads // nkheads
313
+ if q_per_kv_heads > 1:
314
+ # nheads_k => nheads_q
315
+ km_broadcast = torch.repeat_interleave(km, q_per_kv_heads, dim=nh_dim)
316
+ else:
317
+ km_broadcast = km
318
+ if return_lse:
319
+ if tensor_layout == "NHD":
320
+ lse_correction = (
321
+ torch.matmul(
322
+ q.transpose(1, 2), km_broadcast.transpose(1, 2).transpose(2, 3)
323
+ )
324
+ .squeeze(-1)
325
+ .to(torch.float32)
326
+ )
327
+ else:
328
+ lse_correction = (
329
+ torch.matmul(q, km_broadcast.transpose(2, 3))
330
+ .squeeze(-1)
331
+ .to(torch.float32)
332
+ )
333
+ else:
334
+ km = None
335
+
336
+ if qk_quant_gran == "per_warp":
337
+ q_int8, q_scale, k_int8, k_scale = per_warp_int8_cuda(
338
+ q,
339
+ k,
340
+ km,
341
+ tensor_layout=tensor_layout,
342
+ BLKQ=128,
343
+ WARPQ=(16 if (q.size(-1) == 128 and pv_accum_dtype == "fp16+fp32") else 32),
344
+ BLKK=64,
345
+ )
346
+ elif qk_quant_gran == "per_thread":
347
+ q_int8, q_scale, k_int8, k_scale = per_thread_int8_triton(
348
+ q,
349
+ k,
350
+ km,
351
+ tensor_layout=tensor_layout,
352
+ BLKQ=128,
353
+ WARPQ=(16 if (q.size(-1) == 128 and pv_accum_dtype == "fp16+fp32") else 32),
354
+ BLKK=64,
355
+ WARPK=64,
356
+ )
357
+
358
+ o = torch.empty(q.size(), dtype=dtype, device=q.device)
359
+
360
+ if pv_accum_dtype in ["fp32", "fp16+fp32"] and smooth_v:
361
+ warnings.warn(f"pv_accum_dtype is {pv_accum_dtype}, smooth_v will be ignored.")
362
+ smooth_v = False
363
+
364
+ if pv_accum_dtype == "fp32":
365
+ v = v.to(torch.float16)
366
+ lse = _qattn_sm80.qk_int8_sv_f16_accum_f32_attn(
367
+ q_int8,
368
+ k_int8,
369
+ v,
370
+ o,
371
+ q_scale,
372
+ k_scale,
373
+ _tensor_layout,
374
+ _is_caual,
375
+ _qk_quant_gran,
376
+ sm_scale,
377
+ _return_lse,
378
+ )
379
+ elif pv_accum_dtype == "fp16":
380
+ if smooth_v:
381
+ smoothed_v, vm = sub_mean(v, tensor_layout=tensor_layout)
382
+ lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_fuse_v_mean_attn(
383
+ q_int8,
384
+ k_int8,
385
+ smoothed_v,
386
+ o,
387
+ q_scale,
388
+ k_scale,
389
+ vm,
390
+ _tensor_layout,
391
+ _is_caual,
392
+ _qk_quant_gran,
393
+ sm_scale,
394
+ _return_lse,
395
+ )
396
+ else:
397
+ v = v.to(torch.float16)
398
+ lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_attn(
399
+ q_int8,
400
+ k_int8,
401
+ v,
402
+ o,
403
+ q_scale,
404
+ k_scale,
405
+ _tensor_layout,
406
+ _is_caual,
407
+ _qk_quant_gran,
408
+ sm_scale,
409
+ _return_lse,
410
+ )
411
+ elif pv_accum_dtype == "fp16+fp32":
412
+ v = v.to(torch.float16)
413
+ lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_attn_inst_buf(
414
+ q_int8,
415
+ k_int8,
416
+ v,
417
+ o,
418
+ q_scale,
419
+ k_scale,
420
+ _tensor_layout,
421
+ _is_caual,
422
+ _qk_quant_gran,
423
+ sm_scale,
424
+ _return_lse,
425
+ )
426
+ else:
427
+ raise ValueError(f"Unsupported pv_accum_dtype: {pv_accum_dtype}")
428
+
429
+ o = o[..., :head_dim_og]
430
+
431
+ if return_lse:
432
+ return (
433
+ o,
434
+ lse / 1.44269504 + lse_correction * sm_scale
435
+ if smooth_k
436
+ else lse / 1.44269504,
437
+ )
438
+ else:
439
+ return o
440
+
441
+
442
+ @torch.compiler.disable
443
+ def sageattn_qk_int8_pv_fp8_cuda(
444
+ q: torch.Tensor,
445
+ k: torch.Tensor,
446
+ v: torch.Tensor,
447
+ tensor_layout: str = "HND",
448
+ is_causal: bool = False,
449
+ qk_quant_gran: str = "per_thread",
450
+ sm_scale: Optional[float] = None,
451
+ pv_accum_dtype: str = "fp32+fp16",
452
+ smooth_k: bool = True,
453
+ smooth_v: bool = False,
454
+ return_lse: bool = False,
455
+ **kwargs: Any,
456
+ ) -> torch.Tensor:
457
+ """
458
+ SageAttention with INT8 quantization for Q and K, FP8 PV with FP32 accumulation, implemented using CUDA.
459
+
460
+ Parameters
461
+ ----------
462
+ q : torch.Tensor
463
+ The query tensor. Shape:
464
+ - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``.
465
+ - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``.
466
+
467
+ k : torch.Tensor
468
+ The key tensor. Shape:
469
+ - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
470
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
471
+
472
+ v : torch.Tensor
473
+ The value tensor. Shape:
474
+ - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
475
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
476
+
477
+ tensor_layout : str
478
+ The tensor layout, either "HND" or "NHD".
479
+ Default: "HND".
480
+
481
+ is_causal : bool
482
+ Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len.
483
+ Default: False.
484
+
485
+ qk_quant_gran : str
486
+ The granularity of quantization for Q and K, either "per_warp" or "per_thread".
487
+ Default: "per_thread".
488
+
489
+ sm_scale : Optional[float]
490
+ The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``.
491
+
492
+ pv_accum_dtype : str
493
+ The dtype of the accumulation of the product of the value tensor and the attention weights, either "fp32" or "fp32+fp32".
494
+ - "fp32": PV accumulation is done in fully in FP32. However, due to the hardware issue, there are only 22 valid bits in the FP32 accumulator.
495
+ - "fp32+fp32": PV accumulation is done in FP32 (actually FP22), but added to a FP32 buffer every few iterations. This offers a balance between speed and accuracy.
496
+ Default: "fp32+fp32".
497
+
498
+ smooth_k : bool
499
+ Whether to smooth the key tensor by subtracting the mean along the sequence dimension.
500
+ Default: True.
501
+
502
+ smooth_v : bool
503
+ Whether to smooth the value tensor by subtracting the mean along the sequence dimension.
504
+ smooth_v will be ignored if pv_accum_dtype is "fp32+fp32".
505
+ Default: False.
506
+
507
+ return_lse : bool
508
+ Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention.
509
+ Default: False.
510
+
511
+ Returns
512
+ -------
513
+ torch.Tensor
514
+ The output tensor. Shape:
515
+ - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``.
516
+ - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``.
517
+
518
+ torch.Tensor
519
+ The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor).
520
+ Shape: ``[batch_size, num_qo_heads, qo_len]``.
521
+ Only returned if `return_lse` is True.
522
+
523
+ Note
524
+ ----
525
+ - ``num_qo_heads`` must be divisible by ``num_kv_heads``.
526
+ - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16``
527
+ - All tensors must be on the same cuda device.
528
+ - `smooth_k` will introduce slight overhead but will improve the accuracy under most circumstances.
529
+ """
530
+
531
+ dtype = q.dtype
532
+ assert q.is_cuda, "Input tensors must be on cuda."
533
+ assert dtype in [torch.float16, torch.bfloat16], (
534
+ "Input tensors must be in dtype of torch.float16 or torch.bfloat16"
535
+ )
536
+ assert qk_quant_gran in ["per_warp", "per_thread"], (
537
+ "qk_quant_gran must be either 'per_warp' or 'per_thread'."
538
+ )
539
+ assert q.device == k.device == v.device, "All tensors must be on the same device."
540
+ assert q.dtype == k.dtype == v.dtype, "All tensors must have the same dtype."
541
+
542
+ # cuda_major_version, cuda_minor_version = get_cuda_version()
543
+ # if(cuda_major_version, cuda_minor_version) < (12, 8) and pv_accum_dtype == 'fp32+fp16':
544
+ # warnings.warn("cuda version < 12.8, change pv_accum_dtype to 'fp32+fp32'")
545
+ # pv_accum_dtype = 'fp32+fp32'
546
+
547
+ # FIXME(DefTruth): make sage attention work compatible with distributed
548
+ # env, for example, xDiT which launch by torchrun. Without this workaround,
549
+ # sage attention will run into illegal memory access error after first
550
+ # inference step in distributed env for multi gpus inference. This small
551
+ # workaround also make sage attention work compatible with torch.compile
552
+ # through non-fullgraph compile mode.
553
+ torch.cuda.set_device(v.device)
554
+
555
+ _tensor_layout = 0 if tensor_layout == "NHD" else 1
556
+ _is_caual = 1 if is_causal else 0
557
+ _qk_quant_gran = 3 if qk_quant_gran == "per_thread" else 2
558
+ _return_lse = 1 if return_lse else 0
559
+
560
+ head_dim_og = q.size(-1)
561
+
562
+ if head_dim_og < 64:
563
+ q = torch.nn.functional.pad(q, (0, 64 - head_dim_og))
564
+ k = torch.nn.functional.pad(k, (0, 64 - head_dim_og))
565
+ v = torch.nn.functional.pad(v, (0, 64 - head_dim_og))
566
+ elif head_dim_og > 64 and head_dim_og < 128:
567
+ q = torch.nn.functional.pad(q, (0, 128 - head_dim_og))
568
+ k = torch.nn.functional.pad(k, (0, 128 - head_dim_og))
569
+ v = torch.nn.functional.pad(v, (0, 128 - head_dim_og))
570
+ elif head_dim_og > 128:
571
+ raise ValueError(f"Unsupported head_dim: {head_dim_og}")
572
+
573
+ # assert last dim is contiguous
574
+ assert q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1, (
575
+ "Last dim of qkv must be contiguous."
576
+ )
577
+
578
+ if sm_scale is None:
579
+ sm_scale = head_dim_og**-0.5
580
+
581
+ seq_dim = 1 if _tensor_layout == 0 else 2
582
+ nh_dim = 2 if _tensor_layout == 0 else 1
583
+
584
+ if smooth_k:
585
+ km = k.mean(dim=seq_dim, keepdim=True)
586
+ nqheads = q.size(2)
587
+ nkheads = k.size(2)
588
+ q_per_kv_heads = nqheads // nkheads
589
+ if q_per_kv_heads > 1:
590
+ # nheads_k => nheads_q
591
+ km_broadcast = torch.repeat_interleave(km, q_per_kv_heads, dim=nh_dim)
592
+ else:
593
+ km_broadcast = km
594
+ if return_lse:
595
+ if tensor_layout == "NHD":
596
+ lse_correction = (
597
+ torch.matmul(
598
+ q.transpose(1, 2), km_broadcast.transpose(1, 2).transpose(2, 3)
599
+ )
600
+ .squeeze(-1)
601
+ .to(torch.float32)
602
+ )
603
+ else:
604
+ lse_correction = (
605
+ torch.matmul(q, km_broadcast.transpose(2, 3))
606
+ .squeeze(-1)
607
+ .to(torch.float32)
608
+ )
609
+ else:
610
+ km = None
611
+
612
+ if qk_quant_gran == "per_warp":
613
+ q_int8, q_scale, k_int8, k_scale = per_warp_int8_cuda(
614
+ q, k, km, tensor_layout=tensor_layout, BLKQ=128, WARPQ=32, BLKK=64
615
+ )
616
+ elif qk_quant_gran == "per_thread":
617
+ q_int8, q_scale, k_int8, k_scale = per_thread_int8_triton(
618
+ q, k, km, tensor_layout=tensor_layout, BLKQ=128, WARPQ=32, BLKK=64, WARPK=64
619
+ )
620
+
621
+ o = torch.empty(q.size(), dtype=dtype, device=q.device)
622
+
623
+ if pv_accum_dtype == "fp32+fp32" and smooth_v:
624
+ warnings.warn("pv_accum_dtype is 'fp32+fp32', smooth_v will be ignored.")
625
+ smooth_v = False
626
+
627
+ if pv_accum_dtype == "fp32+fp16" and smooth_v:
628
+ warnings.warn("pv_accum_dtype is 'fp32+fp16', smooth_v will be ignored.")
629
+ smooth_v = False
630
+
631
+ quant_v_scale_max = 448.0
632
+ if pv_accum_dtype == "fp32+fp16":
633
+ quant_v_scale_max = 2.25
634
+
635
+ v_fp8, v_scale, vm = per_channel_fp8(
636
+ v, tensor_layout=tensor_layout, scale_max=quant_v_scale_max, smooth_v=smooth_v
637
+ )
638
+ print("before kernel call")
639
+ if pv_accum_dtype == "fp32":
640
+ if smooth_v:
641
+ lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_fuse_v_mean_attn(
642
+ q_int8,
643
+ k_int8,
644
+ v_fp8,
645
+ o,
646
+ q_scale,
647
+ k_scale,
648
+ v_scale,
649
+ vm,
650
+ _tensor_layout,
651
+ _is_caual,
652
+ _qk_quant_gran,
653
+ sm_scale,
654
+ _return_lse,
655
+ )
656
+ torch.cuda.synchronize()
657
+ else:
658
+ lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn(
659
+ q_int8,
660
+ k_int8,
661
+ v_fp8,
662
+ o,
663
+ q_scale,
664
+ k_scale,
665
+ v_scale,
666
+ _tensor_layout,
667
+ _is_caual,
668
+ _qk_quant_gran,
669
+ sm_scale,
670
+ _return_lse,
671
+ )
672
+ torch.cuda.synchronize()
673
+ elif pv_accum_dtype == "fp32+fp32":
674
+ lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf(
675
+ q_int8,
676
+ k_int8,
677
+ v_fp8,
678
+ o,
679
+ q_scale,
680
+ k_scale,
681
+ v_scale,
682
+ _tensor_layout,
683
+ _is_caual,
684
+ _qk_quant_gran,
685
+ sm_scale,
686
+ _return_lse,
687
+ )
688
+ torch.cuda.synchronize()
689
+ elif pv_accum_dtype == "fp32+fp16":
690
+ lse = ops.qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf(
691
+ q_int8,
692
+ k_int8,
693
+ v_fp8,
694
+ o,
695
+ q_scale,
696
+ k_scale,
697
+ v_scale,
698
+ _tensor_layout,
699
+ _is_caual,
700
+ _qk_quant_gran,
701
+ sm_scale,
702
+ _return_lse,
703
+ )
704
+ torch.cuda.synchronize()
705
+ o = o[..., :head_dim_og]
706
+ print("after kernel call")
707
+ if return_lse:
708
+ return (
709
+ o,
710
+ lse / 1.44269504 + lse_correction * sm_scale
711
+ if smooth_k
712
+ else lse / 1.44269504,
713
+ )
714
+ else:
715
+ return o
716
+
717
+
718
+ @torch.compiler.disable
719
+ def sageattn_qk_int8_pv_fp8_cuda_sm90(
720
+ q: torch.Tensor,
721
+ k: torch.Tensor,
722
+ v: torch.Tensor,
723
+ tensor_layout: str = "HND",
724
+ is_causal: bool = False,
725
+ qk_quant_gran: str = "per_thread",
726
+ sm_scale: Optional[float] = None,
727
+ pv_accum_dtype: str = "fp32+fp32",
728
+ smooth_k: bool = True,
729
+ return_lse: bool = False,
730
+ **kwargs: Any,
731
+ ) -> torch.Tensor:
732
+ """
733
+ SageAttention with INT8 quantization for Q and K, FP8 PV with FP32 accumulation, implemented using CUDA.
734
+
735
+ Parameters
736
+ ----------
737
+ q : torch.Tensor
738
+ The query tensor. Shape:
739
+ - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``.
740
+ - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``.
741
+
742
+ k : torch.Tensor
743
+ The key tensor. Shape:
744
+ - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
745
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
746
+
747
+ v : torch.Tensor
748
+ The value tensor. Shape:
749
+ - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
750
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
751
+
752
+ tensor_layout : str
753
+ The tensor layout, either "HND" or "NHD".
754
+ Default: "HND".
755
+
756
+ is_causal : bool
757
+ Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len.
758
+ Default: False.
759
+
760
+ qk_quant_gran : str
761
+ The granularity of quantization for Q and K, either "per_warp" or "per_thread".
762
+ Default: "per_thread".
763
+
764
+ sm_scale : Optional[float]
765
+ The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``.
766
+
767
+ pv_accum_dtype : str
768
+ The dtype of the accumulation of the product of the value tensor and the attention weights, either "fp32" or "fp32+fp32".
769
+ - "fp32": PV accumulation is done in fully in FP32. However, due to the hardware issue, there are only 22 valid bits in the FP32 accumulator.
770
+ - "fp32+fp32": PV accumulation is done in FP32 (actually FP22), but added to a FP32 buffer every few iterations. This offers a balance between speed and accuracy.
771
+ Default: "fp32+fp32".
772
+
773
+ smooth_k : bool
774
+ Whether to smooth the key tensor by subtracting the mean along the sequence dimension.
775
+ Default: True.
776
+
777
+ return_lse : bool
778
+ Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention.
779
+ Default: False.
780
+
781
+ Returns
782
+ -------
783
+ torch.Tensor
784
+ The output tensor. Shape:
785
+ - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``.
786
+ - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``.
787
+
788
+ torch.Tensor
789
+ The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor).
790
+ Shape: ``[batch_size, num_qo_heads, qo_len]``.
791
+ Only returned if `return_lse` is True.
792
+
793
+ Note
794
+ ----
795
+ - ``num_qo_heads`` must be divisible by ``num_kv_heads``.
796
+ - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16``
797
+ - All tensors must be on the same cuda device.
798
+ - `smooth_k` will introduce slight overhead but will improve the accuracy under most circumstances.
799
+ """
800
+
801
+ dtype = q.dtype
802
+ assert q.is_cuda, "Input tensors must be on cuda."
803
+ assert dtype in [torch.float16, torch.bfloat16], (
804
+ "Input tensors must be in dtype of torch.float16 or torch.bfloat16"
805
+ )
806
+ assert qk_quant_gran in ["per_warp", "per_thread"], (
807
+ "qk_quant_gran must be either 'per_warp' or 'per_thread'."
808
+ )
809
+ assert q.device == k.device == v.device, "All tensors must be on the same device."
810
+ assert q.dtype == k.dtype == v.dtype, "All tensors must have the same dtype."
811
+
812
+ torch.cuda.set_device(v.device)
813
+
814
+ _tensor_layout = 0 if tensor_layout == "NHD" else 1
815
+ _is_caual = 1 if is_causal else 0
816
+ _qk_quant_gran = 3 if qk_quant_gran == "per_thread" else 2
817
+ _return_lse = 1 if return_lse else 0
818
+
819
+ head_dim_og = q.size(-1)
820
+
821
+ if head_dim_og < 64:
822
+ q = torch.nn.functional.pad(q, (0, 64 - head_dim_og))
823
+ k = torch.nn.functional.pad(k, (0, 64 - head_dim_og))
824
+ v = torch.nn.functional.pad(v, (0, 64 - head_dim_og))
825
+ elif head_dim_og > 64 and head_dim_og < 128:
826
+ q = torch.nn.functional.pad(q, (0, 128 - head_dim_og))
827
+ k = torch.nn.functional.pad(k, (0, 128 - head_dim_og))
828
+ v = torch.nn.functional.pad(v, (0, 128 - head_dim_og))
829
+ elif head_dim_og > 128:
830
+ raise ValueError(f"Unsupported head_dim: {head_dim_og}")
831
+
832
+ # assert last dim is contiguous
833
+ assert q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1, (
834
+ "Last dim of qkv must be contiguous."
835
+ )
836
+
837
+ if sm_scale is None:
838
+ sm_scale = head_dim_og**-0.5
839
+
840
+ seq_dim = 1 if _tensor_layout == 0 else 2
841
+ nh_dim = 2 if _tensor_layout == 0 else 1
842
+
843
+ if smooth_k:
844
+ km = k.mean(dim=seq_dim, keepdim=True)
845
+ nqheads = q.size(2)
846
+ nkheads = k.size(2)
847
+ q_per_kv_heads = nqheads // nkheads
848
+ if q_per_kv_heads > 1:
849
+ # nheads_k => nheads_q
850
+ km_broadcast = torch.repeat_interleave(km, q_per_kv_heads, dim=nh_dim)
851
+ else:
852
+ km_broadcast = km
853
+ if return_lse:
854
+ if tensor_layout == "NHD":
855
+ lse_correction = (
856
+ torch.matmul(
857
+ q.transpose(1, 2), km_broadcast.transpose(1, 2).transpose(2, 3)
858
+ )
859
+ .squeeze(-1)
860
+ .to(torch.float32)
861
+ )
862
+ else:
863
+ lse_correction = (
864
+ torch.matmul(q, km_broadcast.transpose(2, 3))
865
+ .squeeze(-1)
866
+ .to(torch.float32)
867
+ )
868
+ else:
869
+ km = None
870
+
871
+ if qk_quant_gran == "per_warp":
872
+ q_int8, q_scale, k_int8, k_scale = per_warp_int8_cuda(
873
+ q, k, km, tensor_layout=tensor_layout, BLKQ=64, WARPQ=16, BLKK=128
874
+ )
875
+ elif qk_quant_gran == "per_thread":
876
+ q_int8, q_scale, k_int8, k_scale = per_thread_int8_triton(
877
+ q,
878
+ k,
879
+ km,
880
+ tensor_layout=tensor_layout,
881
+ BLKQ=64,
882
+ WARPQ=16,
883
+ BLKK=128,
884
+ WARPK=128,
885
+ )
886
+
887
+ o = torch.empty(q.size(), dtype=dtype, device=q.device)
888
+
889
+ # pad v to multiple of 128
890
+ # TODO: modify per_channel_fp8 kernel to handle this
891
+ kv_len = k.size(seq_dim)
892
+ v_pad_len = 128 - (kv_len % 128) if kv_len % 128 != 0 else 0
893
+ if v_pad_len > 0:
894
+ if tensor_layout == "HND":
895
+ v = torch.cat(
896
+ [
897
+ v,
898
+ torch.zeros(
899
+ v.size(0),
900
+ v.size(1),
901
+ v_pad_len,
902
+ v.size(3),
903
+ dtype=v.dtype,
904
+ device=v.device,
905
+ ),
906
+ ],
907
+ dim=2,
908
+ )
909
+ else:
910
+ v = torch.cat(
911
+ [
912
+ v,
913
+ torch.zeros(
914
+ v.size(0),
915
+ v_pad_len,
916
+ v.size(2),
917
+ v.size(3),
918
+ dtype=v.dtype,
919
+ device=v.device,
920
+ ),
921
+ ],
922
+ dim=1,
923
+ )
924
+
925
+ v_fp8, v_scale, _ = per_channel_fp8(v, tensor_layout=tensor_layout, smooth_v=False)
926
+
927
+ if pv_accum_dtype == "fp32":
928
+ raise NotImplementedError("Please use pv_accum_dtype='fp32+fp32' for sm90.")
929
+ lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn(
930
+ q_int8,
931
+ k_int8,
932
+ v_fp8,
933
+ o,
934
+ q_scale,
935
+ k_scale,
936
+ v_scale,
937
+ _tensor_layout,
938
+ _is_caual,
939
+ _qk_quant_gran,
940
+ sm_scale,
941
+ _return_lse,
942
+ )
943
+ elif pv_accum_dtype == "fp32+fp32":
944
+ print(
945
+ "qint8",
946
+ q_int8.shape,
947
+ "qscale",
948
+ q_scale.shape,
949
+ "kint8",
950
+ k_int8.shape,
951
+ "kscale",
952
+ k_scale.shape,
953
+ "vfp8",
954
+ v_fp8.shape,
955
+ "vscale",
956
+ v_scale.shape,
957
+ )
958
+ lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf_sm90(
959
+ q_int8,
960
+ k_int8,
961
+ v_fp8,
962
+ o,
963
+ q_scale,
964
+ k_scale,
965
+ v_scale,
966
+ _tensor_layout,
967
+ _is_caual,
968
+ _qk_quant_gran,
969
+ sm_scale,
970
+ _return_lse,
971
+ )
972
+
973
+ o = o[..., :head_dim_og]
974
+
975
+ if return_lse:
976
+ return (
977
+ o,
978
+ lse / 1.44269504 + lse_correction * sm_scale
979
+ if smooth_k
980
+ else lse / 1.44269504,
981
+ )
982
+ else:
983
+ return o
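
Note (not part of the commit): the smooth_k bookkeeping in the functions above relies on softmax(QK^T) being invariant to subtracting the per-head key mean, with the logsumexp shifted by (q · km) * sm_scale; the kernels also return lse in base 2, hence the 1.44269504 (log2 e) divisor. A small pure-PyTorch sanity sketch of that identity, for illustration only:

import torch

q = torch.randn(1, 1, 4, 8)
k = torch.randn(1, 1, 6, 8)
sm_scale = q.size(-1) ** -0.5

km = k.mean(dim=-2, keepdim=True)                       # per-head key mean, [1, 1, 1, 8]
logits = (q @ k.transpose(-1, -2)) * sm_scale
logits_smoothed = (q @ (k - km).transpose(-1, -2)) * sm_scale

# attention weights are unchanged by the mean subtraction
assert torch.allclose(logits.softmax(-1), logits_smoothed.softmax(-1), atol=1e-5)

# the logsumexp only shifts by (q . km) * sm_scale, which is the lse_correction term
lse = torch.logsumexp(logits, dim=-1)
lse_smoothed = torch.logsumexp(logits_smoothed, dim=-1)
correction = (q @ km.transpose(-1, -2)).squeeze(-1) * sm_scale
assert torch.allclose(lse, lse_smoothed + correction, atol=1e-5)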
build/torch27-cxx11-cu126-x86_64-linux/sage_attention/layers.py ADDED
File without changes
build/torch27-cxx11-cu126-x86_64-linux/sage_attention/quant.py ADDED
@@ -0,0 +1,326 @@
1
+ """
2
+ Copyright (c) 2024 by SageAttention team.
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ import torch
18
+ from typing import Optional
19
+
20
+ from ._ops import ops
21
+
22
+
23
+ def per_block_int8(
24
+ q: torch.Tensor,
25
+ k: torch.Tensor,
26
+ km: Optional[torch.Tensor] = None,
27
+ BLKQ: int = 128,
28
+ BLKK: int = 64,
29
+ sm_scale: Optional[float] = None,
30
+ tensor_layout: str = "HND",
31
+ ):
32
+ """
33
+ Quantize the query tensor `q` and the key tensor `k` with per block quantization.
34
+
35
+ Parameters
36
+ ----------
37
+ q : torch.Tensor
38
+ The query tensor. Shape:
39
+ - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``.
40
+ - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``.
41
+
42
+ k : torch.Tensor
43
+ The key tensor. Shape:
44
+ - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
45
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
46
+
47
+ km : Optional[torch.Tensor]
48
+ The mean tensor of `k` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]``.
49
+ Should be of the same dtype as `k` if provided. Default is None.
50
+
51
+ sm_scale : Optional[float]
52
+ The scale factor for the softmax operation. Default is ``head_dim**-0.5``.
53
+ It will be multiplied by ``1.44269504`` to work together with the triton attention kernel.
54
+
55
+ tensor_layout : str
56
+ The tensor layout, either "HND" or "NHD".
57
+ Default: "HND".
58
+
59
+ Returns
60
+ -------
61
+ Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]
62
+ A tuple containing:
63
+ - The quantized query tensor. Shape: Same as `q` but with `int8` dtype.
64
+ - The scale tensor of the query tensor. Shape: ``[batch_size, num_qo_heads, (qo_len + BLKQ - 1) // BLKQ]`` with `float32` dtype.
65
+ - The quantized key tensor. Shape: Same as `k` but with `int8` dtype.
66
+ - The scale tensor of the key tensor. Shape: ``[batch_size, num_kv_heads, (kv_len + BLKK - 1) // BLKK]`` with `float32` dtype.
67
+
68
+ Note
69
+ ----
70
+ - The tensors `q` and `k` must have the dtype ``torch.float16`` or ``torch.bfloat16``
71
+ """
72
+
73
+ q_int8 = torch.empty(q.shape, dtype=torch.int8, device=q.device)
74
+ k_int8 = torch.empty(k.shape, dtype=torch.int8, device=k.device)
75
+
76
+ if tensor_layout == "HND":
77
+ b, h_qo, qo_len, head_dim = q.shape
78
+ _, h_kv, kv_len, _ = k.shape
79
+
80
+ elif tensor_layout == "NHD":
81
+ b, qo_len, h_qo, head_dim = q.shape
82
+ _, kv_len, h_kv, _ = k.shape
83
+
84
+ else:
85
+ raise ValueError(f"Unknown tensor layout: {tensor_layout}")
86
+
87
+ _tensor_layout = 0 if tensor_layout == "NHD" else 1
88
+
89
+ q_scale = torch.empty(
90
+ (b, h_qo, (qo_len + BLKQ - 1) // BLKQ), device=q.device, dtype=torch.float32
91
+ )
92
+ k_scale = torch.empty(
93
+ (b, h_kv, (kv_len + BLKK - 1) // BLKK), device=q.device, dtype=torch.float32
94
+ )
95
+
96
+ if sm_scale is None:
97
+ sm_scale = head_dim**-0.5
98
+
99
+ sm_scale *= 1.44269504
100
+
101
+ ops.quant_per_block_int8_cuda(q, q_int8, q_scale, sm_scale, BLKQ, _tensor_layout)
102
+ if km is not None:
103
+ km = km.squeeze(1) if _tensor_layout == 0 else km.squeeze(2)
104
+ ops.quant_per_block_int8_fuse_sub_mean_cuda(
105
+ k, km, k_int8, k_scale, BLKK, _tensor_layout
106
+ )
107
+ else:
108
+ # The bound CUDA op expects an sm_scale argument; use 1.0 for K to avoid scaling
109
+ ops.quant_per_block_int8_cuda(k, k_int8, k_scale, 1.0, BLKK, _tensor_layout)
110
+
111
+ return q_int8, q_scale, k_int8, k_scale
112
+
113
+
114
+ def per_warp_int8(
115
+ q: torch.Tensor,
116
+ k: torch.Tensor,
117
+ km: Optional[torch.Tensor] = None,
118
+ BLKQ: int = 128,
119
+ WARPQ: int = 32,
120
+ BLKK: int = 64,
121
+ tensor_layout: str = "HND",
122
+ ):
123
+ """
124
+ Quantize the query tensor `q` with per warp quantization and the key tensor `k` with per block quantization.
125
+ Warp size of quantizing `q` is 16 or 32, with a block size of 64 or 128.
126
+ Block size of quantizing `k` is 64 or 128.
127
+
128
+ Parameters
129
+ ----------
130
+ q : torch.Tensor
131
+ The query tensor. Shape:
132
+ - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``.
133
+ - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``.
134
+
135
+ k : torch.Tensor
136
+ The key tensor. Shape:
137
+ - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
138
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
139
+
140
+ km : Optional[torch.Tensor]
141
+ The mean tensor of `k` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]``.
142
+ Should be of the same dtype as `k` if provided. Default is None.
143
+
144
+ tensor_layout : str
145
+ The tensor layout, either "HND" or "NHD".
146
+ Default: "HND".
147
+
148
+ Returns
149
+ -------
150
+ Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]
151
+ A tuple containing:
152
+ - The quantized query tensor. Shape: Same as `q` but with `int8` dtype.
153
+ - The scale tensor of the query tensor. Shape: ``[batch_size, num_qo_heads, (qo_len + BLKQ - 1) // BLKQ * (BLKQ // WARPQ)]`` with `float32` dtype.
154
+ - The quantized key tensor. Shape: Same as `k` but with `int8` dtype.
155
+ - The scale tensor of the key tensor. Shape: ``[batch_size, num_kv_heads, (kv_len + BLKK - 1) // BLKK]`` with `float32` dtype.
156
+
157
+ Note
158
+ ----
159
+ - The tensors `q` and `k` must have the dtype ``torch.float16`` or ``torch.bfloat16``
160
+ """
161
+
162
+ q_int8 = torch.empty(q.shape, dtype=torch.int8, device=q.device)
163
+ k_int8 = torch.empty(k.shape, dtype=torch.int8, device=k.device)
164
+
165
+ if tensor_layout == "HND":
166
+ b, h_qo, qo_len, head_dim = q.shape
167
+ _, h_kv, kv_len, _ = k.shape
168
+
169
+ elif tensor_layout == "NHD":
170
+ b, qo_len, h_qo, head_dim = q.shape
171
+ _, kv_len, h_kv, _ = k.shape
172
+
173
+ else:
174
+ raise ValueError(f"Unknown tensor layout: {tensor_layout}")
175
+
176
+ _tensor_layout = 0 if tensor_layout == "NHD" else 1
177
+
178
+ q_scale = torch.empty(
179
+ (b, h_qo, ((qo_len + BLKQ - 1) // BLKQ) * (BLKQ // WARPQ)),
180
+ device=q.device,
181
+ dtype=torch.float32,
182
+ )
183
+ k_scale = torch.empty(
184
+ (b, h_kv, (kv_len + BLKK - 1) // BLKK), device=q.device, dtype=torch.float32
185
+ )
186
+
187
+ ops.quant_per_warp_int8_cuda(q, q_int8, q_scale, BLKQ, WARPQ, _tensor_layout)
188
+
189
+ if km is not None:
190
+ km = km.squeeze(1) if _tensor_layout == 0 else km.squeeze(2)
191
+ ops.quant_per_block_int8_fuse_sub_mean_cuda(
192
+ k, km, k_int8, k_scale, BLKK, _tensor_layout
193
+ )
194
+ else:
195
+ # The bound CUDA op expects an sm_scale argument; use 1.0 for K to avoid scaling
196
+ ops.quant_per_block_int8_cuda(k, k_int8, k_scale, 1.0, BLKK, _tensor_layout)
197
+
198
+ return q_int8, q_scale, k_int8, k_scale
199
+
200
+
201
+ def sub_mean(v: torch.Tensor, tensor_layout: str = "HND"):
202
+ """
203
+ Calculate the mean of the tensor `v` along the sequence length dimension and subtract it from `v`. Result is stored as fp16.
204
+
205
+ Parameters
206
+ ----------
207
+ v : torch.Tensor
208
+ The input tensor. Shape:
209
+ - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
210
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
211
+
212
+ tensor_layout : str
213
+ The tensor layout, either "HND" or "NHD".
214
+ Default: "HND".
215
+
216
+ Returns
217
+ -------
218
+ Tuple[torch.Tensor, torch.Tensor]
219
+ A tuple containing:
220
+ - The tensor `v_smoothed` with the mean subtracted and stored as fp16. Shape: Same as `v` with `float16` dtype.
221
+ - The mean tensor of `v` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]`` with dtype same as `v`.
222
+
223
+ Note
224
+ ----
225
+ - The tensors `v` must have the dtype ``torch.float16`` or ``torch.bfloat16``
226
+ - The returned tensor `v_smoothed` will have dtype ``torch.float16`` regardless of the input dtype.
227
+ - The returned mean tensor will have the same dtype as the input tensor.
228
+ """
229
+
230
+ _tensor_layout = 0 if tensor_layout == "NHD" else 1
231
+ vm = v.mean(dim=1 if _tensor_layout == 0 else 2)
232
+
233
+ v_smoothed = torch.empty(v.shape, dtype=torch.float16, device=v.device)
234
+
235
+ # subtract mean and store the result as fp16
236
+ ops.sub_mean_cuda(v, vm, v_smoothed, _tensor_layout)
237
+
238
+ return v_smoothed, vm
239
+
240
+
241
+ def per_channel_fp8(
242
+ v: torch.Tensor,
243
+ tensor_layout: str = "HND",
244
+ scale_max: float = 448.0,
245
+ smooth_v: bool = True,
246
+ ):
247
+ """
248
+ Transpose, pad and permute the tensor `v` and quantize it to fp8 with per channel quantization.
249
+ `v` is first transposed along the head dimension and the sequence length dimension, then padded to a multiple of 64.
250
+ After that, the tensor is permuted along the sequence length dimension by ``[0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15]``.
251
+ The quantization is done per channel, with the scale value and smooth factor calculated per channel.
252
+
253
+ Parameters
254
+ ----------
255
+ v : torch.Tensor
256
+ The input tensor. Shape:
257
+ - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
258
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
259
+
260
+ tensor_layout : str
261
+ The tensor layout, either "HND" or "NHD".
262
+ Default: "HND".
263
+
264
+ scale_max : float
265
+ The maximum scale value for the quantization. Default is 448.0 (upper bound of E4M3 data format).
266
+
267
+ smooth_v : bool
268
+ Whether to smooth the quantized tensor. Default is True.
269
+
270
+ Returns
271
+ -------
272
+ Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]
273
+ A tuple containing:
274
+ - The quantized tensor `v_fp8`. Shape:
275
+ - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, head_dim, (kv_len + 63) // 64 * 64]``, with `float8_e4m3fn` dtype.
276
+ - If `tensor_layout` is "NHD": ``[batch_size, head_dim, num_kv_heads, (kv_len + 63) // 64 * 64]``, with `float8_e4m3fn` dtype.
277
+ - The scale tensor of `v`. Shape: ``[batch_size, num_kv_heads, head_dim]`` with `float32` dtype.
278
+ - The mean tensor of `v` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]`` with `float32` dtype.
279
+
280
+ Note
281
+ ----
282
+ - The tensors `v` must have the dtype ``torch.float16`` or ``torch.bfloat16``
283
+ - The returned mean tensor will be None if `smooth_v` is False. Otherwise it will have dtype ``torch.float32``.
284
+ """
285
+
286
+ _tensor_layout = 0 if tensor_layout == "NHD" else 1
287
+
288
+ if tensor_layout == "HND":
289
+ b, h_kv, kv_len, head_dim = v.shape
290
+ padded_len = (kv_len + 63) // 64 * 64
291
+ v_transposed_permutted = torch.empty(
292
+ (b, h_kv, head_dim, padded_len), dtype=v.dtype, device=v.device
293
+ )
294
+
295
+ elif tensor_layout == "NHD":
296
+ b, kv_len, h_kv, head_dim = v.shape
297
+ padded_len = (kv_len + 63) // 64 * 64
298
+ v_transposed_permutted = torch.empty(
299
+ (b, head_dim, h_kv, padded_len), dtype=v.dtype, device=v.device
300
+ )
301
+
302
+ ops.transpose_pad_permute_cuda(v, v_transposed_permutted, _tensor_layout)
303
+
304
+ v_fp8 = torch.empty(
305
+ v_transposed_permutted.shape, dtype=torch.float8_e4m3fn, device=v.device
306
+ )
307
+
308
+ v_scale = torch.empty((b, h_kv, head_dim), dtype=torch.float32, device=v.device)
309
+ vm = torch.empty((b, h_kv, head_dim), dtype=torch.float32, device=v.device)
310
+
311
+ if smooth_v:
312
+ ops.mean_scale_fuse_quant_cuda(
313
+ v_transposed_permutted,
314
+ v_fp8,
315
+ vm,
316
+ v_scale,
317
+ kv_len,
318
+ scale_max,
319
+ _tensor_layout,
320
+ )
321
+ return v_fp8, v_scale, vm
322
+ else:
323
+ ops.scale_fuse_quant_cuda(
324
+ v_transposed_permutted, v_fp8, v_scale, kv_len, scale_max, _tensor_layout
325
+ )
326
+ return v_fp8, v_scale, None
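A minimal usage sketch for `per_channel_fp8` above, with hypothetical sizes; it needs a CUDA device and the bundled `_sage_attention_*` extension:

```python
import torch
from sage_attention.quant import per_channel_fp8  # assumes this build is importable as `sage_attention`

b, h_kv, kv_len, head_dim = 1, 8, 1000, 128       # hypothetical sizes, "HND" layout
v = torch.randn(b, h_kv, kv_len, head_dim, dtype=torch.float16, device="cuda")

v_fp8, v_scale, vm = per_channel_fp8(v, tensor_layout="HND", smooth_v=True)
print(v_fp8.shape)    # [1, 8, 128, 1024]: kv_len padded to a multiple of 64, seq and head_dim transposed
print(v_scale.shape)  # [1, 8, 128]: one fp32 scale per (batch, kv head, channel)
print(vm.shape)       # [1, 8, 128]: per-channel mean, returned only when smooth_v=True
```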
build/torch27-cxx11-cu126-x86_64-linux/sage_attention/quant_per_thread.py ADDED
@@ -0,0 +1,204 @@
1
+ """
2
+ Copyright (c) 2024 by SageAttention team.
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ import torch
18
+ import triton
19
+ import triton.language as tl
20
+
21
+ @triton.jit
22
+ def quant_query_per_thread_int8_kernel(Input, Output, Scale, L,
23
+ stride_iz, stride_ih, stride_in,
24
+ stride_oz, stride_oh, stride_on,
25
+ stride_sz, stride_sh,
26
+ C: tl.constexpr, BLK: tl.constexpr):
27
+ off_blk = tl.program_id(0) // 8
28
+ off_tld = tl.program_id(0) % 8
29
+ off_h = tl.program_id(1)
30
+ off_b = tl.program_id(2)
31
+
32
+ offs_n = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld
33
+ offs_k = tl.arange(0, C)
34
+
35
+ input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :]
36
+ output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :]
37
+ scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 8 + off_tld
38
+
39
+ x = tl.load(input_ptrs, mask=offs_n[:, None] < L)
40
+ x = x.to(tl.float32)
41
+ scale = tl.max(tl.abs(x)) / 127. + 0.0000001
42
+ x_int8 = x / scale
43
+ x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1)
44
+ x_int8 = x_int8.to(tl.int8)
45
+ tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L)
46
+ tl.store(scale_ptrs, scale)
47
+
48
+ @triton.jit
49
+ def quant_key_per_thread_int8_kernel(Input, Output, Scale, L,
50
+ stride_iz, stride_ih, stride_in,
51
+ stride_oz, stride_oh, stride_on,
52
+ stride_sz, stride_sh,
53
+ C: tl.constexpr, BLK: tl.constexpr):
54
+ off_blk = tl.program_id(0) // 4
55
+ off_tld = tl.program_id(0) % 4
56
+ off_h = tl.program_id(1)
57
+ off_b = tl.program_id(2)
58
+
59
+ # offs_n = off_blk * BLK + tl.cat(tl.arange(0, BLK // 8) * 8, tl.arange(0, BLK // 8) * 8 + 1, True) + off_tld * 2
60
+ # offs_k = tl.arange(0, C)
61
+
62
+ # input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :]
63
+ # output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :]
64
+ # scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 4 + off_tld
65
+
66
+ # x = tl.load(input_ptrs, mask=offs_n[:, None] < L)
67
+ # x = x.to(tl.float32)
68
+ # scale = tl.max(tl.abs(x)) / 127. + 0.0000001
69
+ # x_int8 = x / scale
70
+ # x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1)
71
+ # x_int8 = x_int8.to(tl.int8)
72
+ # tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L)
73
+ # tl.store(scale_ptrs, scale)
74
+
75
+ offs_n0 = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld * 2
76
+ offs_n1 = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld * 2 + 1
77
+ offs_k = tl.arange(0, C)
78
+
79
+ input_ptrs0 = Input + off_b * stride_iz + off_h * stride_ih + offs_n0[:, None] * stride_in + offs_k[None, :]
80
+ input_ptrs1 = Input + off_b * stride_iz + off_h * stride_ih + offs_n1[:, None] * stride_in + offs_k[None, :]
81
+ output_ptrs0 = Output + off_b * stride_oz + off_h * stride_oh + offs_n0[:, None] * stride_on + offs_k[None, :]
82
+ output_ptrs1 = Output + off_b * stride_oz + off_h * stride_oh + offs_n1[:, None] * stride_on + offs_k[None, :]
83
+ scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 4 + off_tld
84
+
85
+ x0 = tl.load(input_ptrs0, mask=offs_n0[:, None] < L)
86
+ x1 = tl.load(input_ptrs1, mask=offs_n1[:, None] < L)
87
+ x0 = x0.to(tl.float32)
88
+ x1 = x1.to(tl.float32)
89
+ scale = max(tl.max(tl.abs(x0)), tl.max(tl.abs(x1))) / 127. + 0.0000001
90
+ x0_int8 = x0 / scale
91
+ x1_int8 = x1 / scale
92
+ x0_int8 += 0.5 * tl.where(x0_int8 >= 0, 1, -1)
93
+ x1_int8 += 0.5 * tl.where(x1_int8 >= 0, 1, -1)
94
+ x0_int8 = x0_int8.to(tl.int8)
95
+ x1_int8 = x1_int8.to(tl.int8)
96
+ tl.store(output_ptrs0, x0_int8, mask=offs_n0[:, None] < L)
97
+ tl.store(output_ptrs1, x1_int8, mask=offs_n1[:, None] < L)
98
+ tl.store(scale_ptrs, scale)
99
+
100
+ @triton.jit
101
+ def quant_query_per_thread_int4_kernel(Input, Output, Scale, L,
102
+ stride_iz, stride_ih, stride_in,
103
+ stride_oz, stride_oh, stride_on,
104
+ stride_sz, stride_sh,
105
+ C: tl.constexpr, BLK: tl.constexpr):
106
+ off_blk = tl.program_id(0) // 8
107
+ off_tld = tl.program_id(0) % 8
108
+ off_h = tl.program_id(1)
109
+ off_b = tl.program_id(2)
110
+
111
+ offs_n = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld
112
+ offs_k = tl.arange(0, C)
113
+
114
+ input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :]
115
+ output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :]
116
+ scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 8 + off_tld
117
+
118
+ x = tl.load(input_ptrs, mask=offs_n[:, None] < L)
119
+ x = x.to(tl.float32)
120
+ scale = tl.max(tl.abs(x)) / 7. + 0.0000001
121
+ x_int8 = x / scale
122
+ x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1)
123
+ x_int8 = x_int8.to(tl.int8)
124
+ tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L)
125
+ tl.store(scale_ptrs, scale)
126
+
127
+ @triton.jit
128
+ def quant_key_per_thread_int4_kernel(Input, Output, Scale, L,
129
+ stride_iz, stride_ih, stride_in,
130
+ stride_oz, stride_oh, stride_on,
131
+ stride_sz, stride_sh,
132
+ C: tl.constexpr, BLK: tl.constexpr):
133
+ off_blk = tl.program_id(0) // 4
134
+ off_tld = tl.program_id(0) % 4
135
+ off_h = tl.program_id(1)
136
+ off_b = tl.program_id(2)
137
+
138
+ offs_n = off_blk * BLK + tl.cat(tl.arange(0, BLK // 8) * 8, tl.arange(0, BLK // 8) * 8 + 1, True) + off_tld * 2
139
+ offs_k = tl.arange(0, C)
140
+
141
+ input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :]
142
+ output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :]
143
+ scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 4 + off_tld
144
+
145
+ x = tl.load(input_ptrs, mask=offs_n[:, None] < L)
146
+ x = x.to(tl.float32)
147
+ scale = tl.max(tl.abs(x)) / 7. + 0.0000001
148
+ x_int8 = x / scale
149
+ x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1)
150
+ x_int8 = x_int8.to(tl.int8)
151
+ tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L)
152
+ tl.store(scale_ptrs, scale)
153
+
154
+ def per_thread_int8(q, k, km=None, BLKQ=128, WARPQ=32, BLKK=64, WARPK=64, sm_scale=None, tensor_layout="HND"):
155
+ q_int8 = torch.empty(q.shape, dtype=torch.int8, device=q.device)
156
+ k_int8 = torch.empty(k.shape, dtype=torch.int8, device=k.device)
157
+
158
+ if km is not None:
159
+ k = k - km
160
+
161
+ if tensor_layout == "HND":
162
+ b, h_qo, qo_len, head_dim = q.shape
163
+ _, h_kv, kv_len, _ = k.shape
164
+
165
+ stride_bz_q, stride_h_q, stride_seq_q = q.stride(0), q.stride(1), q.stride(2)
166
+ stride_bz_qo, stride_h_qo, stride_seq_qo = q_int8.stride(0), q_int8.stride(1), q_int8.stride(2)
167
+ stride_bz_k, stride_h_k, stride_seq_k = k.stride(0), k.stride(1), k.stride(2)
168
+ stride_bz_ko, stride_h_ko, stride_seq_ko = k_int8.stride(0), k_int8.stride(1), k_int8.stride(2)
169
+ elif tensor_layout == "NHD":
170
+ b, qo_len, h_qo, head_dim = q.shape
171
+ _, kv_len, h_kv, _ = k.shape
172
+
173
+ stride_bz_q, stride_h_q, stride_seq_q = q.stride(0), q.stride(2), q.stride(1)
174
+ stride_bz_qo, stride_h_qo, stride_seq_qo = q_int8.stride(0), q_int8.stride(2), q_int8.stride(1)
175
+ stride_bz_k, stride_h_k, stride_seq_k = k.stride(0), k.stride(2), k.stride(1)
176
+ stride_bz_ko, stride_h_ko, stride_seq_ko = k_int8.stride(0), k_int8.stride(2), k_int8.stride(1)
177
+ else:
178
+ raise ValueError(f"Unknown tensor layout: {tensor_layout}")
179
+
180
+ q_scale = torch.empty((b, h_qo, (qo_len + BLKQ - 1) // BLKQ * (BLKQ // WARPQ) * 8), device=q.device, dtype=torch.float32)
181
+ k_scale = torch.empty((b, h_kv, (kv_len + BLKK - 1) // BLKK * (BLKK // WARPK) * 4), device=q.device, dtype=torch.float32)
182
+
183
+ if sm_scale is None:
184
+ sm_scale = head_dim**-0.5
185
+
186
+ grid = ((qo_len + BLKQ - 1) // BLKQ * (BLKQ // WARPQ) * 8, h_qo, b)
187
+ quant_query_per_thread_int8_kernel[grid](
188
+ q, q_int8, q_scale, qo_len,
189
+ stride_bz_q, stride_h_q, stride_seq_q,
190
+ stride_bz_qo, stride_h_qo, stride_seq_qo,
191
+ q_scale.stride(0), q_scale.stride(1),
192
+ C=head_dim, BLK=WARPQ
193
+ )
194
+
195
+ grid = ((kv_len + BLKK - 1) // BLKK * (BLKK // WARPK) * 4, h_kv, b)
196
+ quant_key_per_thread_int8_kernel[grid](
197
+ k, k_int8, k_scale, kv_len,
198
+ stride_bz_k, stride_h_k, stride_seq_k,
199
+ stride_bz_ko, stride_h_ko, stride_seq_ko,
200
+ k_scale.stride(0), k_scale.stride(1),
201
+ C=head_dim, BLK=WARPK
202
+ )
203
+
204
+ return q_int8, q_scale, k_int8, k_scale
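All four kernels above share the same quantization recipe: divide by the group absmax (scaled to the integer range), round half away from zero, and cast to int8. A per-tensor PyTorch restatement of that recipe, for illustration only:

```python
import torch

def quantize_int8_reference(x: torch.Tensor, qmax: float = 127.0):
    # Per-tensor variant of the per-thread scheme used in the Triton kernels above.
    x = x.float()
    scale = x.abs().max() / qmax + 1e-7                       # epsilon avoids division by zero
    x_scaled = x / scale
    # Round half away from zero, mirroring `x += 0.5 * tl.where(x >= 0, 1, -1)` followed
    # by the truncating int8 cast.
    half = 0.5 * torch.where(x_scaled >= 0, torch.ones_like(x_scaled), -torch.ones_like(x_scaled))
    x_int8 = torch.trunc(x_scaled + half).to(torch.int8)
    return x_int8, scale
```

In the kernels themselves the absmax is taken per thread group (8 groups per query warp tile, 4 per key warp tile), which is why `per_thread_int8` allocates `q_scale` with `* 8` and `k_scale` with `* 4` entries per block.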
build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__init__.py ADDED
@@ -0,0 +1,12 @@
1
+ from .quant import per_block_int8, per_warp_int8, sub_mean, per_channel_fp8
2
+ from .core import sageattn, sageattn_qk_int8_pv_fp8_cuda
3
+
4
+
5
+ __all__ = [
6
+ "per_block_int8",
7
+ "per_warp_int8",
8
+ "sub_mean",
9
+ "per_channel_fp8",
10
+ "sageattn",
11
+ "sageattn_qk_int8_pv_fp8_cuda",
12
+ ]
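A minimal end-to-end sketch of the public API re-exported here (hypothetical shapes; assumes the built package is importable as `sage_attention` and a supported CUDA GPU is present):

```python
import torch
import sage_attention

b, h, seq, d = 2, 16, 4096, 128                  # hypothetical sizes, "HND" layout
q = torch.randn(b, h, seq, d, dtype=torch.float16, device="cuda")
k = torch.randn_like(q)
v = torch.randn_like(q)

# sageattn dispatches to the kernel matching the detected compute capability (sm80/sm89/sm90/sm120).
o = sage_attention.sageattn(q, k, v, tensor_layout="HND", is_causal=True)
print(o.shape)  # [2, 16, 4096, 128]
```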
build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (433 Bytes)

build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc ADDED
Binary file (550 Bytes)

build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc ADDED
Binary file (33.4 kB)

build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc ADDED
Binary file (13.4 kB)

build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc ADDED
Binary file (13 kB)
build/torch27-cxx11-cu128-x86_64-linux/sage_attention/_ops.py ADDED
@@ -0,0 +1,9 @@
1
+ import torch
2
+ from . import _sage_attention_44b112f_dirty
3
+ ops = torch.ops._sage_attention_44b112f_dirty
4
+
5
+ def add_op_namespace_prefix(op_name: str):
6
+ """
7
+ Prefix op by namespace.
8
+ """
9
+ return f"_sage_attention_44b112f_dirty::{op_name}"
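For reference, `add_op_namespace_prefix` simply builds the fully qualified op name for this specific build, which is the format expected by `torch.library`-style utilities:

```python
from sage_attention._ops import add_op_namespace_prefix

print(add_op_namespace_prefix("qk_int8_sv_f8_accum_f32_fuse_v_scale_attn"))
# "_sage_attention_44b112f_dirty::qk_int8_sv_f8_accum_f32_fuse_v_scale_attn"
```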
build/torch27-cxx11-cu128-x86_64-linux/sage_attention/_sage_attention_44b112f_dirty.abi3.so ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d47c952dd9781283ff0dcbd533779de33b0bfa1966dcc0cc8accd0412217c1c5
3
+ size 26553840
build/torch27-cxx11-cu128-x86_64-linux/sage_attention/core.py ADDED
@@ -0,0 +1,983 @@
1
+ """
2
+ Copyright (c) 2024 by SageAttention team.
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ import torch
18
+ import torch.nn.functional as F
19
+
20
+ from ._ops import ops
21
+
22
+
23
+ from .quant import per_warp_int8 as per_warp_int8_cuda
24
+ from .quant import sub_mean
25
+ from .quant import per_channel_fp8
26
+ from .quant_per_thread import per_thread_int8 as per_thread_int8_triton
27
+
28
+ from typing import Any, List, Literal, Optional, Tuple, Union
29
+ import warnings
30
+
31
+
32
+ import subprocess
33
+ import re
34
+
35
+
36
+ def get_cuda_version():
37
+ try:
38
+ output = subprocess.check_output(["nvcc", "--version"]).decode()
39
+ match = re.search(r"release (\d+)\.(\d+)", output)
40
+ if match:
41
+ major, minor = int(match.group(1)), int(match.group(2))
42
+ return major, minor
43
+ except Exception as e:
44
+ print("Failed to get CUDA version:", e)
45
+ return None, None
46
+
47
+
48
+ def get_cuda_arch_versions():
49
+ cuda_archs = []
50
+ for i in range(torch.cuda.device_count()):
51
+ major, minor = torch.cuda.get_device_capability(i)
52
+ cuda_archs.append(f"sm{major}{minor}")
53
+ return cuda_archs
54
+
55
+
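A quick way to check which branch of the dispatcher below will be taken on the current machine (sketch; assumes at least one CUDA device):

```python
import torch
from sage_attention.core import get_cuda_arch_versions

archs = get_cuda_arch_versions()            # e.g. ["sm89"] on an RTX 4090
print(archs[torch.cuda.current_device()])   # "sm80"/"sm89"/"sm90"/"sm120" map to the branches below
```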
56
+ def sageattn(
57
+ q: torch.Tensor,
58
+ k: torch.Tensor,
59
+ v: torch.Tensor,
60
+ tensor_layout: str = "HND",
61
+ is_causal: bool = False,
62
+ sm_scale: Optional[float] = None,
63
+ return_lse: bool = False,
64
+ **kwargs: Any,
65
+ ):
66
+ """
67
+ Automatically selects the appropriate implementation of the SageAttention kernel based on the GPU compute capability.
68
+
69
+ Parameters
70
+ ----------
71
+ q : torch.Tensor
72
+ The query tensor. Shape:
73
+ - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``.
74
+ - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``.
75
+
76
+ k : torch.Tensor
77
+ The key tensor. Shape:
78
+ - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
79
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
80
+
81
+ v : torch.Tensor
82
+ The value tensor. Shape:
83
+ - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
84
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
85
+
86
+ tensor_layout : str
87
+ The tensor layout, either "HND" or "NHD".
88
+ Default: "HND".
89
+
90
+ is_causal : bool
91
+ Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len.
92
+ Default: False.
93
+
94
+ sm_scale : Optional[float]
95
+ The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``.
96
+
97
+ return_lse : bool
98
+ Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention.
99
+ Default: False.
100
+
101
+ Returns
102
+ -------
103
+ torch.Tensor
104
+ The output tensor. Shape:
105
+ - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``.
106
+ - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``.
107
+
108
+ torch.Tensor
109
+ The logsumexp of each row of the matrix QK^T * scaling (i.e., the log of the softmax normalization factor).
110
+ Shape: ``[batch_size, num_qo_heads, qo_len]``.
111
+ Only returned if `return_lse` is True.
112
+
113
+ Note
114
+ ----
115
+ - ``num_qo_heads`` must be divisible by ``num_kv_heads``.
116
+ - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16``
117
+ - All tensors must be on the same cuda device.
118
+ """
119
+
120
+ arch = get_cuda_arch_versions()[q.device.index]
121
+ if arch == "sm80":
122
+ return sageattn_qk_int8_pv_fp16_cuda(
123
+ q,
124
+ k,
125
+ v,
126
+ tensor_layout=tensor_layout,
127
+ is_causal=is_causal,
128
+ sm_scale=sm_scale,
129
+ return_lse=return_lse,
130
+ pv_accum_dtype="fp32",
131
+ )
132
+ elif arch == "sm89":
133
+ return sageattn_qk_int8_pv_fp8_cuda(
134
+ q,
135
+ k,
136
+ v,
137
+ tensor_layout=tensor_layout,
138
+ is_causal=is_causal,
139
+ sm_scale=sm_scale,
140
+ return_lse=return_lse,
141
+ pv_accum_dtype="fp32+fp16",
142
+ )
143
+ elif arch == "sm90":
144
+ return sageattn_qk_int8_pv_fp8_cuda_sm90(
145
+ q,
146
+ k,
147
+ v,
148
+ tensor_layout=tensor_layout,
149
+ is_causal=is_causal,
150
+ sm_scale=sm_scale,
151
+ return_lse=return_lse,
152
+ pv_accum_dtype="fp32+fp32",
153
+ )
154
+ elif arch == "sm120":
155
+ return sageattn_qk_int8_pv_fp8_cuda(
156
+ q,
157
+ k,
158
+ v,
159
+ tensor_layout=tensor_layout,
160
+ is_causal=is_causal,
161
+ qk_quant_gran="per_warp",
162
+ sm_scale=sm_scale,
163
+ return_lse=return_lse,
164
+ pv_accum_dtype="fp32+fp16",
165
+ ) # sm120 has accurate fp32 accumulator for fp8 mma and triton kernel is currently not usable on sm120.
166
+ else:
167
+ raise ValueError(f"Unsupported CUDA architecture: {arch}")
168
+
169
+
170
+ @torch.compiler.disable
171
+ def sageattn_qk_int8_pv_fp16_cuda(
172
+ q: torch.Tensor,
173
+ k: torch.Tensor,
174
+ v: torch.Tensor,
175
+ tensor_layout: str = "HND",
176
+ is_causal: bool = False,
177
+ qk_quant_gran: str = "per_thread",
178
+ sm_scale: Optional[float] = None,
179
+ pv_accum_dtype: str = "fp32",
180
+ smooth_k: bool = True,
181
+ smooth_v: bool = False,
182
+ return_lse: bool = False,
183
+ **kwargs: Any,
184
+ ) -> torch.Tensor:
185
+ """
186
+ SageAttention with INT8 quantization for Q and K, FP16 PV with FP16/FP32 accumulation, implemented using CUDA.
187
+
188
+ Parameters
189
+ ----------
190
+ q : torch.Tensor
191
+ The query tensor. Shape:
192
+ - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``.
193
+ - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``.
194
+
195
+ k : torch.Tensor
196
+ The key tensor. Shape:
197
+ - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
198
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
199
+
200
+ v : torch.Tensor
201
+ The value tensor. Shape:
202
+ - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
203
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
204
+
205
+ tensor_layout : str
206
+ The tensor layout, either "HND" or "NHD".
207
+ Default: "HND".
208
+
209
+ is_causal : bool
210
+ Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len.
211
+ Default: False.
212
+
213
+ qk_quant_gran : str
214
+ The granularity of quantization for Q and K, either "per_warp" or "per_thread".
215
+ Default: "per_thread".
216
+
217
+ sm_scale : Optional[float]
218
+ The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``.
219
+
220
+ pv_accum_dtype : str
221
+ The dtype of the accumulation of the product of the value tensor and the attention weights, either "fp16", "fp16+fp32" or "fp32".
222
+ - "fp16": PV accumulation is done fully in FP16. This is the fastest option but may lead to numerical instability. The `smooth_v` option increases accuracy in cases where the value tensor has a large bias (as in CogVideoX-2b).
223
+ - "fp32": PV accumulation is done in FP32. This is the most accurate option but may be slower than "fp16" due to CUDA core overhead.
224
+ - "fp16+fp32": PV accumulation is done in FP16, but added to a FP32 buffer every few iterations. This offers a balance between speed and accuracy.
225
+ Default: "fp32".
226
+
227
+ smooth_k : bool
228
+ Whether to smooth the key tensor by subtracting the mean along the sequence dimension.
229
+ Default: True.
230
+
231
+ smooth_v : bool
232
+ Whether to smooth the value tensor by subtracting the mean along the sequence dimension.
233
+ smooth_v will be ignored if pv_accum_dtype is "fp32" or "fp16+fp32".
234
+ Default: False.
235
+
236
+ return_lse : bool
237
+ Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention.
238
+ Default: False.
239
+
240
+ Returns
241
+ -------
242
+ torch.Tensor
243
+ The output tensor. Shape:
244
+ - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``.
245
+ - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``.
246
+
247
+ torch.Tensor
248
+ The logsumexp of each row of the matrix QK^T * scaling (i.e., the log of the softmax normalization factor).
249
+ Shape: ``[batch_size, num_qo_heads, qo_len]``.
250
+ Only returned if `return_lse` is True.
251
+
252
+ Note
253
+ ----
254
+ - ``num_qo_heads`` must be divisible by ``num_kv_heads``.
255
+ - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16``
256
+ - All tensors must be on the same cuda device.
257
+ - `smooth_k` will introduce slight overhead but will improve the accuracy under most circumstances.
258
+ """
259
+
260
+ dtype = q.dtype
261
+ assert q.is_cuda, "Input tensors must be on cuda."
262
+ assert dtype in [torch.float16, torch.bfloat16], (
263
+ "Input tensors must be in dtype of torch.float16 or torch.bfloat16"
264
+ )
265
+ assert qk_quant_gran in ["per_warp", "per_thread"], (
266
+ "qk_quant_gran must be either 'per_warp' or 'per_thread'."
267
+ )
268
+ assert q.device == k.device == v.device, "All tensors must be on the same device."
269
+ assert q.dtype == k.dtype == v.dtype, "All tensors must have the same dtype."
270
+
271
+ # FIXME(DefTruth): make sage attention work compatible with distributed
272
+ # env, for example, xDiT which launch by torchrun. Without this workaround,
273
+ # sage attention will run into illegal memory access error after first
274
+ # inference step in distributed env for multi gpus inference. This small
275
+ # workaround also make sage attention work compatible with torch.compile
276
+ # through non-fullgraph compile mode.
277
+ torch.cuda.set_device(v.device)
278
+
279
+ _tensor_layout = 0 if tensor_layout == "NHD" else 1
280
+ _is_caual = 1 if is_causal else 0
281
+ _qk_quant_gran = 3 if qk_quant_gran == "per_thread" else 2
282
+ _return_lse = 1 if return_lse else 0
283
+
284
+ head_dim_og = q.size(-1)
285
+
286
+ if head_dim_og < 64:
287
+ q = torch.nn.functional.pad(q, (0, 64 - head_dim_og))
288
+ k = torch.nn.functional.pad(k, (0, 64 - head_dim_og))
289
+ v = torch.nn.functional.pad(v, (0, 64 - head_dim_og))
290
+ elif head_dim_og > 64 and head_dim_og < 128:
291
+ q = torch.nn.functional.pad(q, (0, 128 - head_dim_og))
292
+ k = torch.nn.functional.pad(k, (0, 128 - head_dim_og))
293
+ v = torch.nn.functional.pad(v, (0, 128 - head_dim_og))
294
+ elif head_dim_og > 128:
295
+ raise ValueError(f"Unsupported head_dim: {head_dim_og}")
296
+
297
+ # assert last dim is contiguous
298
+ assert q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1, (
299
+ "Last dim of qkv must be contiguous."
300
+ )
301
+
302
+ if sm_scale is None:
303
+ sm_scale = head_dim_og**-0.5
304
+
305
+ seq_dim = 1 if _tensor_layout == 0 else 2
306
+ nh_dim = 2 if _tensor_layout == 0 else 1
307
+
308
+ if smooth_k:
309
+ km = k.mean(dim=seq_dim, keepdim=True)
310
+ nqheads = q.size(2)
311
+ nkheads = k.size(2)
312
+ q_per_kv_heads = nqheads // nkheads
313
+ if q_per_kv_heads > 1:
314
+ # nheads_k => nheads_q
315
+ km_broadcast = torch.repeat_interleave(km, q_per_kv_heads, dim=nh_dim)
316
+ else:
317
+ km_broadcast = km
318
+ if return_lse:
319
+ if tensor_layout == "NHD":
320
+ lse_correction = (
321
+ torch.matmul(
322
+ q.transpose(1, 2), km_broadcast.transpose(1, 2).transpose(2, 3)
323
+ )
324
+ .squeeze(-1)
325
+ .to(torch.float32)
326
+ )
327
+ else:
328
+ lse_correction = (
329
+ torch.matmul(q, km_broadcast.transpose(2, 3))
330
+ .squeeze(-1)
331
+ .to(torch.float32)
332
+ )
333
+ else:
334
+ km = None
335
+
336
+ if qk_quant_gran == "per_warp":
337
+ q_int8, q_scale, k_int8, k_scale = per_warp_int8_cuda(
338
+ q,
339
+ k,
340
+ km,
341
+ tensor_layout=tensor_layout,
342
+ BLKQ=128,
343
+ WARPQ=(16 if (q.size(-1) == 128 and pv_accum_dtype == "fp16+fp32") else 32),
344
+ BLKK=64,
345
+ )
346
+ elif qk_quant_gran == "per_thread":
347
+ q_int8, q_scale, k_int8, k_scale = per_thread_int8_triton(
348
+ q,
349
+ k,
350
+ km,
351
+ tensor_layout=tensor_layout,
352
+ BLKQ=128,
353
+ WARPQ=(16 if (q.size(-1) == 128 and pv_accum_dtype == "fp16+fp32") else 32),
354
+ BLKK=64,
355
+ WARPK=64,
356
+ )
357
+
358
+ o = torch.empty(q.size(), dtype=dtype, device=q.device)
359
+
360
+ if pv_accum_dtype in ["fp32", "fp16+fp32"] and smooth_v:
361
+ warnings.warn(f"pv_accum_dtype is {pv_accum_dtype}, smooth_v will be ignored.")
362
+ smooth_v = False
363
+
364
+ if pv_accum_dtype == "fp32":
365
+ v = v.to(torch.float16)
366
+ lse = _qattn_sm80.qk_int8_sv_f16_accum_f32_attn(
367
+ q_int8,
368
+ k_int8,
369
+ v,
370
+ o,
371
+ q_scale,
372
+ k_scale,
373
+ _tensor_layout,
374
+ _is_caual,
375
+ _qk_quant_gran,
376
+ sm_scale,
377
+ _return_lse,
378
+ )
379
+ elif pv_accum_dtype == "fp16":
380
+ if smooth_v:
381
+ smoothed_v, vm = sub_mean(v, tensor_layout=tensor_layout)
382
+ lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_fuse_v_mean_attn(
383
+ q_int8,
384
+ k_int8,
385
+ smoothed_v,
386
+ o,
387
+ q_scale,
388
+ k_scale,
389
+ vm,
390
+ _tensor_layout,
391
+ _is_caual,
392
+ _qk_quant_gran,
393
+ sm_scale,
394
+ _return_lse,
395
+ )
396
+ else:
397
+ v = v.to(torch.float16)
398
+ lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_attn(
399
+ q_int8,
400
+ k_int8,
401
+ v,
402
+ o,
403
+ q_scale,
404
+ k_scale,
405
+ _tensor_layout,
406
+ _is_caual,
407
+ _qk_quant_gran,
408
+ sm_scale,
409
+ _return_lse,
410
+ )
411
+ elif pv_accum_dtype == "fp16+fp32":
412
+ v = v.to(torch.float16)
413
+ lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_attn_inst_buf(
414
+ q_int8,
415
+ k_int8,
416
+ v,
417
+ o,
418
+ q_scale,
419
+ k_scale,
420
+ _tensor_layout,
421
+ _is_caual,
422
+ _qk_quant_gran,
423
+ sm_scale,
424
+ _return_lse,
425
+ )
426
+ else:
427
+ raise ValueError(f"Unsupported pv_accum_dtype: {pv_accum_dtype}")
428
+
429
+ o = o[..., :head_dim_og]
430
+
431
+ if return_lse:
432
+ return (
433
+ o,
434
+ lse / 1.44269504 + lse_correction * sm_scale
435
+ if smooth_k
436
+ else lse / 1.44269504,
437
+ )
438
+ else:
439
+ return o
440
+
441
+
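A note on the `return_lse` branch above: the kernel accumulates the logsumexp in base 2, and `smooth_k` shifts every key by `km`, so the returned expression restores the natural-log LSE of the original scores:

```python
# lse_kernel = log2( sum_j exp2( q · (k_j - km) * sm_scale * log2(e) ) )
# Dividing by log2(e) = 1.44269504 converts it back to natural log:
#     lse_smoothed = lse_kernel / 1.44269504
# Subtracting km shifts every score in a row by the constant (q · km) * sm_scale, so
#     lse_original = lse_smoothed + (q @ km.transpose(-2, -1)).squeeze(-1) * sm_scale
# which is exactly `lse / 1.44269504 + lse_correction * sm_scale` in the return above.
```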
442
+ @torch.compiler.disable
443
+ def sageattn_qk_int8_pv_fp8_cuda(
444
+ q: torch.Tensor,
445
+ k: torch.Tensor,
446
+ v: torch.Tensor,
447
+ tensor_layout: str = "HND",
448
+ is_causal: bool = False,
449
+ qk_quant_gran: str = "per_thread",
450
+ sm_scale: Optional[float] = None,
451
+ pv_accum_dtype: str = "fp32+fp16",
452
+ smooth_k: bool = True,
453
+ smooth_v: bool = False,
454
+ return_lse: bool = False,
455
+ **kwargs: Any,
456
+ ) -> torch.Tensor:
457
+ """
458
+ SageAttention with INT8 quantization for Q and K, FP8 PV with FP32 accumulation, implemented using CUDA.
459
+
460
+ Parameters
461
+ ----------
462
+ q : torch.Tensor
463
+ The query tensor. Shape:
464
+ - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``.
465
+ - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``.
466
+
467
+ k : torch.Tensor
468
+ The key tensor. Shape:
469
+ - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
470
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
471
+
472
+ v : torch.Tensor
473
+ The value tensor. Shape:
474
+ - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
475
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
476
+
477
+ tensor_layout : str
478
+ The tensor layout, either "HND" or "NHD".
479
+ Default: "HND".
480
+
481
+ is_causal : bool
482
+ Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len.
483
+ Default: False.
484
+
485
+ qk_quant_gran : str
486
+ The granularity of quantization for Q and K, either "per_warp" or "per_thread".
487
+ Default: "per_thread".
488
+
489
+ sm_scale : Optional[float]
490
+ The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``.
491
+
492
+ pv_accum_dtype : str
493
+ The dtype of the accumulation of the product of the value tensor and the attention weights, one of "fp32", "fp32+fp32" or "fp32+fp16".
494
+ - "fp32": PV accumulation is done fully in FP32. However, due to a hardware limitation, only 22 bits of the FP32 accumulator are valid.
495
+ - "fp32+fp32": PV accumulation is done in FP32 (effectively FP22), but added to an FP32 buffer every few iterations. This offers a balance between speed and accuracy.
+ - "fp32+fp16": PV accumulation uses the `qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf` kernel with a narrowed value quantization range (``scale_max = 2.25``).
496
+ Default: "fp32+fp16".
497
+
498
+ smooth_k : bool
499
+ Whether to smooth the key tensor by subtracting the mean along the sequence dimension.
500
+ Default: True.
501
+
502
+ smooth_v : bool
503
+ Whether to smooth the value tensor by subtracting the mean along the sequence dimension.
504
+ smooth_v will be ignored if pv_accum_dtype is "fp32+fp32" or "fp32+fp16".
505
+ Default: False.
506
+
507
+ return_lse : bool
508
+ Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention.
509
+ Default: False.
510
+
511
+ Returns
512
+ -------
513
+ torch.Tensor
514
+ The output tensor. Shape:
515
+ - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``.
516
+ - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``.
517
+
518
+ torch.Tensor
519
+ The logsumexp of each row of the matrix QK^T * scaling (i.e., the log of the softmax normalization factor).
520
+ Shape: ``[batch_size, num_qo_heads, qo_len]``.
521
+ Only returned if `return_lse` is True.
522
+
523
+ Note
524
+ ----
525
+ - ``num_qo_heads`` must be divisible by ``num_kv_heads``.
526
+ - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16``
527
+ - All tensors must be on the same cuda device.
528
+ - `smooth_k` will introduce slight overhead but will improve the accuracy under most circumstances.
529
+ """
530
+
531
+ dtype = q.dtype
532
+ assert q.is_cuda, "Input tensors must be on cuda."
533
+ assert dtype in [torch.float16, torch.bfloat16], (
534
+ "Input tensors must be in dtype of torch.float16 or torch.bfloat16"
535
+ )
536
+ assert qk_quant_gran in ["per_warp", "per_thread"], (
537
+ "qk_quant_gran must be either 'per_warp' or 'per_thread'."
538
+ )
539
+ assert q.device == k.device == v.device, "All tensors must be on the same device."
540
+ assert q.dtype == k.dtype == v.dtype, "All tensors must have the same dtype."
541
+
542
+ # cuda_major_version, cuda_minor_version = get_cuda_version()
543
+ # if(cuda_major_version, cuda_minor_version) < (12, 8) and pv_accum_dtype == 'fp32+fp16':
544
+ # warnings.warn("cuda version < 12.8, change pv_accum_dtype to 'fp32+fp32'")
545
+ # pv_accum_dtype = 'fp32+fp32'
546
+
547
+ # FIXME(DefTruth): make SageAttention work in distributed environments,
548
+ # e.g. xDiT launched via torchrun. Without this workaround, SageAttention
549
+ # runs into an illegal memory access error after the first inference step
550
+ # in multi-GPU distributed inference. This small workaround also makes
551
+ # SageAttention compatible with torch.compile in non-fullgraph compile
552
+ # mode.
553
+ torch.cuda.set_device(v.device)
554
+
555
+ _tensor_layout = 0 if tensor_layout == "NHD" else 1
556
+ _is_caual = 1 if is_causal else 0
557
+ _qk_quant_gran = 3 if qk_quant_gran == "per_thread" else 2
558
+ _return_lse = 1 if return_lse else 0
559
+
560
+ head_dim_og = q.size(-1)
561
+
562
+ if head_dim_og < 64:
563
+ q = torch.nn.functional.pad(q, (0, 64 - head_dim_og))
564
+ k = torch.nn.functional.pad(k, (0, 64 - head_dim_og))
565
+ v = torch.nn.functional.pad(v, (0, 64 - head_dim_og))
566
+ elif head_dim_og > 64 and head_dim_og < 128:
567
+ q = torch.nn.functional.pad(q, (0, 128 - head_dim_og))
568
+ k = torch.nn.functional.pad(k, (0, 128 - head_dim_og))
569
+ v = torch.nn.functional.pad(v, (0, 128 - head_dim_og))
570
+ elif head_dim_og > 128:
571
+ raise ValueError(f"Unsupported head_dim: {head_dim_og}")
572
+
573
+ # assert last dim is contiguous
574
+ assert q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1, (
575
+ "Last dim of qkv must be contiguous."
576
+ )
577
+
578
+ if sm_scale is None:
579
+ sm_scale = head_dim_og**-0.5
580
+
581
+ seq_dim = 1 if _tensor_layout == 0 else 2
582
+ nh_dim = 2 if _tensor_layout == 0 else 1
583
+
584
+ if smooth_k:
585
+ km = k.mean(dim=seq_dim, keepdim=True)
586
+ nqheads = q.size(2)
587
+ nkheads = k.size(2)
588
+ q_per_kv_heads = nqheads // nkheads
589
+ if q_per_kv_heads > 1:
590
+ # nheads_k => nheads_q
591
+ km_broadcast = torch.repeat_interleave(km, q_per_kv_heads, dim=nh_dim)
592
+ else:
593
+ km_broadcast = km
594
+ if return_lse:
595
+ if tensor_layout == "NHD":
596
+ lse_correction = (
597
+ torch.matmul(
598
+ q.transpose(1, 2), km_broadcast.transpose(1, 2).transpose(2, 3)
599
+ )
600
+ .squeeze(-1)
601
+ .to(torch.float32)
602
+ )
603
+ else:
604
+ lse_correction = (
605
+ torch.matmul(q, km_broadcast.transpose(2, 3))
606
+ .squeeze(-1)
607
+ .to(torch.float32)
608
+ )
609
+ else:
610
+ km = None
611
+
612
+ if qk_quant_gran == "per_warp":
613
+ q_int8, q_scale, k_int8, k_scale = per_warp_int8_cuda(
614
+ q, k, km, tensor_layout=tensor_layout, BLKQ=128, WARPQ=32, BLKK=64
615
+ )
616
+ elif qk_quant_gran == "per_thread":
617
+ q_int8, q_scale, k_int8, k_scale = per_thread_int8_triton(
618
+ q, k, km, tensor_layout=tensor_layout, BLKQ=128, WARPQ=32, BLKK=64, WARPK=64
619
+ )
620
+
621
+ o = torch.empty(q.size(), dtype=dtype, device=q.device)
622
+
623
+ if pv_accum_dtype == "fp32+fp32" and smooth_v:
624
+ warnings.warn("pv_accum_dtype is 'fp32+fp32', smooth_v will be ignored.")
625
+ smooth_v = False
626
+
627
+ if pv_accum_dtype == "fp32+fp16" and smooth_v:
628
+ warnings.warn("pv_accum_dtype is 'fp32+fp16', smooth_v will be ignored.")
629
+ smooth_v = False
630
+
631
+ quant_v_scale_max = 448.0
632
+ if pv_accum_dtype == "fp32+fp16":
633
+ quant_v_scale_max = 2.25
634
+
635
+ v_fp8, v_scale, vm = per_channel_fp8(
636
+ v, tensor_layout=tensor_layout, scale_max=quant_v_scale_max, smooth_v=smooth_v
637
+ )
639
+ if pv_accum_dtype == "fp32":
640
+ if smooth_v:
641
+ lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_fuse_v_mean_attn(
642
+ q_int8,
643
+ k_int8,
644
+ v_fp8,
645
+ o,
646
+ q_scale,
647
+ k_scale,
648
+ v_scale,
649
+ vm,
650
+ _tensor_layout,
651
+ _is_caual,
652
+ _qk_quant_gran,
653
+ sm_scale,
654
+ _return_lse,
655
+ )
656
+ torch.cuda.synchronize()
657
+ else:
658
+ lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn(
659
+ q_int8,
660
+ k_int8,
661
+ v_fp8,
662
+ o,
663
+ q_scale,
664
+ k_scale,
665
+ v_scale,
666
+ _tensor_layout,
667
+ _is_caual,
668
+ _qk_quant_gran,
669
+ sm_scale,
670
+ _return_lse,
671
+ )
672
+ torch.cuda.synchronize()
673
+ elif pv_accum_dtype == "fp32+fp32":
674
+ lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf(
675
+ q_int8,
676
+ k_int8,
677
+ v_fp8,
678
+ o,
679
+ q_scale,
680
+ k_scale,
681
+ v_scale,
682
+ _tensor_layout,
683
+ _is_caual,
684
+ _qk_quant_gran,
685
+ sm_scale,
686
+ _return_lse,
687
+ )
688
+ torch.cuda.synchronize()
689
+ elif pv_accum_dtype == "fp32+fp16":
690
+ lse = ops.qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf(
691
+ q_int8,
692
+ k_int8,
693
+ v_fp8,
694
+ o,
695
+ q_scale,
696
+ k_scale,
697
+ v_scale,
698
+ _tensor_layout,
699
+ _is_caual,
700
+ _qk_quant_gran,
701
+ sm_scale,
702
+ _return_lse,
703
+ )
704
+ torch.cuda.synchronize()
705
+ o = o[..., :head_dim_og]
707
+ if return_lse:
708
+ return (
709
+ o,
710
+ lse / 1.44269504 + lse_correction * sm_scale
711
+ if smooth_k
712
+ else lse / 1.44269504,
713
+ )
714
+ else:
715
+ return o
716
+
717
+
718
+ @torch.compiler.disable
719
+ def sageattn_qk_int8_pv_fp8_cuda_sm90(
720
+ q: torch.Tensor,
721
+ k: torch.Tensor,
722
+ v: torch.Tensor,
723
+ tensor_layout: str = "HND",
724
+ is_causal: bool = False,
725
+ qk_quant_gran: str = "per_thread",
726
+ sm_scale: Optional[float] = None,
727
+ pv_accum_dtype: str = "fp32+fp32",
728
+ smooth_k: bool = True,
729
+ return_lse: bool = False,
730
+ **kwargs: Any,
731
+ ) -> torch.Tensor:
732
+ """
733
+ SageAttention with INT8 quantization for Q and K, FP8 PV with FP32 accumulation, implemented using CUDA.
734
+
735
+ Parameters
736
+ ----------
737
+ q : torch.Tensor
738
+ The query tensor. Shape:
739
+ - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``.
740
+ - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``.
741
+
742
+ k : torch.Tensor
743
+ The key tensor. Shape:
744
+ - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
745
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
746
+
747
+ v : torch.Tensor
748
+ The value tensor. Shape:
749
+ - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
750
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
751
+
752
+ tensor_layout : str
753
+ The tensor layout, either "HND" or "NHD".
754
+ Default: "HND".
755
+
756
+ is_causal : bool
757
+ Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len.
758
+ Default: False.
759
+
760
+ qk_quant_gran : str
761
+ The granularity of quantization for Q and K, either "per_warp" or "per_thread".
762
+ Default: "per_thread".
763
+
764
+ sm_scale : Optional[float]
765
+ The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``.
766
+
767
+ pv_accum_dtype : str
768
+ The dtype of the accumulation of the product of the value tensor and the attention weights, either "fp32" or "fp32+fp32".
769
+ - "fp32": PV accumulation is done fully in FP32. However, due to a hardware limitation, only 22 bits of the FP32 accumulator are valid. (Not implemented on sm90; use "fp32+fp32".)
770
+ - "fp32+fp32": PV accumulation is done in FP32 (actually FP22), but added to a FP32 buffer every few iterations. This offers a balance between speed and accuracy.
771
+ Default: "fp32+fp32".
772
+
773
+ smooth_k : bool
774
+ Whether to smooth the key tensor by subtracting the mean along the sequence dimension.
775
+ Default: True.
776
+
777
+ return_lse : bool
778
+ Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention.
779
+ Default: False.
780
+
781
+ Returns
782
+ -------
783
+ torch.Tensor
784
+ The output tensor. Shape:
785
+ - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``.
786
+ - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``.
787
+
788
+ torch.Tensor
789
+ The logsumexp of each row of the matrix QK^T * scaling (i.e., the log of the softmax normalization factor).
790
+ Shape: ``[batch_size, num_qo_heads, qo_len]``.
791
+ Only returned if `return_lse` is True.
792
+
793
+ Note
794
+ ----
795
+ - ``num_qo_heads`` must be divisible by ``num_kv_heads``.
796
+ - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16``
797
+ - All tensors must be on the same cuda device.
798
+ - `smooth_k` will introduce slight overhead but will improve the accuracy under most circumstances.
799
+ """
800
+
801
+ dtype = q.dtype
802
+ assert q.is_cuda, "Input tensors must be on cuda."
803
+ assert dtype in [torch.float16, torch.bfloat16], (
804
+ "Input tensors must be in dtype of torch.float16 or torch.bfloat16"
805
+ )
806
+ assert qk_quant_gran in ["per_warp", "per_thread"], (
807
+ "qk_quant_gran must be either 'per_warp' or 'per_thread'."
808
+ )
809
+ assert q.device == k.device == v.device, "All tensors must be on the same device."
810
+ assert q.dtype == k.dtype == v.dtype, "All tensors must have the same dtype."
811
+
812
+ torch.cuda.set_device(v.device)
813
+
814
+ _tensor_layout = 0 if tensor_layout == "NHD" else 1
815
+ _is_caual = 1 if is_causal else 0
816
+ _qk_quant_gran = 3 if qk_quant_gran == "per_thread" else 2
817
+ _return_lse = 1 if return_lse else 0
818
+
819
+ head_dim_og = q.size(-1)
820
+
821
+ if head_dim_og < 64:
822
+ q = torch.nn.functional.pad(q, (0, 64 - head_dim_og))
823
+ k = torch.nn.functional.pad(k, (0, 64 - head_dim_og))
824
+ v = torch.nn.functional.pad(v, (0, 64 - head_dim_og))
825
+ elif head_dim_og > 64 and head_dim_og < 128:
826
+ q = torch.nn.functional.pad(q, (0, 128 - head_dim_og))
827
+ k = torch.nn.functional.pad(k, (0, 128 - head_dim_og))
828
+ v = torch.nn.functional.pad(v, (0, 128 - head_dim_og))
829
+ elif head_dim_og > 128:
830
+ raise ValueError(f"Unsupported head_dim: {head_dim_og}")
831
+
832
+ # assert last dim is contiguous
833
+ assert q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1, (
834
+ "Last dim of qkv must be contiguous."
835
+ )
836
+
837
+ if sm_scale is None:
838
+ sm_scale = head_dim_og**-0.5
839
+
840
+ seq_dim = 1 if _tensor_layout == 0 else 2
841
+ nh_dim = 2 if _tensor_layout == 0 else 1
842
+
843
+ if smooth_k:
844
+ km = k.mean(dim=seq_dim, keepdim=True)
845
+ nqheads = q.size(2)
846
+ nkheads = k.size(2)
847
+ q_per_kv_heads = nqheads // nkheads
848
+ if q_per_kv_heads > 1:
849
+ # nheads_k => nheads_q
850
+ km_broadcast = torch.repeat_interleave(km, q_per_kv_heads, dim=nh_dim)
851
+ else:
852
+ km_broadcast = km
853
+ if return_lse:
854
+ if tensor_layout == "NHD":
855
+ lse_correction = (
856
+ torch.matmul(
857
+ q.transpose(1, 2), km_broadcast.transpose(1, 2).transpose(2, 3)
858
+ )
859
+ .squeeze(-1)
860
+ .to(torch.float32)
861
+ )
862
+ else:
863
+ lse_correction = (
864
+ torch.matmul(q, km_broadcast.transpose(2, 3))
865
+ .squeeze(-1)
866
+ .to(torch.float32)
867
+ )
868
+ else:
869
+ km = None
870
+
871
+ if qk_quant_gran == "per_warp":
872
+ q_int8, q_scale, k_int8, k_scale = per_warp_int8_cuda(
873
+ q, k, km, tensor_layout=tensor_layout, BLKQ=64, WARPQ=16, BLKK=128
874
+ )
875
+ elif qk_quant_gran == "per_thread":
876
+ q_int8, q_scale, k_int8, k_scale = per_thread_int8_triton(
877
+ q,
878
+ k,
879
+ km,
880
+ tensor_layout=tensor_layout,
881
+ BLKQ=64,
882
+ WARPQ=16,
883
+ BLKK=128,
884
+ WARPK=128,
885
+ )
886
+
887
+ o = torch.empty(q.size(), dtype=dtype, device=q.device)
888
+
889
+ # pad v to multiple of 128
890
+ # TODO: modify per_channel_fp8 kernel to handle this
891
+ kv_len = k.size(seq_dim)
892
+ v_pad_len = 128 - (kv_len % 128) if kv_len % 128 != 0 else 0
893
+ if v_pad_len > 0:
894
+ if tensor_layout == "HND":
895
+ v = torch.cat(
896
+ [
897
+ v,
898
+ torch.zeros(
899
+ v.size(0),
900
+ v.size(1),
901
+ v_pad_len,
902
+ v.size(3),
903
+ dtype=v.dtype,
904
+ device=v.device,
905
+ ),
906
+ ],
907
+ dim=2,
908
+ )
909
+ else:
910
+ v = torch.cat(
911
+ [
912
+ v,
913
+ torch.zeros(
914
+ v.size(0),
915
+ v_pad_len,
916
+ v.size(2),
917
+ v.size(3),
918
+ dtype=v.dtype,
919
+ device=v.device,
920
+ ),
921
+ ],
922
+ dim=1,
923
+ )
924
+
925
+ v_fp8, v_scale, _ = per_channel_fp8(v, tensor_layout=tensor_layout, smooth_v=False)
926
+
927
+ if pv_accum_dtype == "fp32":
928
+ raise NotImplementedError("Please use pv_accum_dtype='fp32+fp32' for sm90.")
929
+ lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn(
930
+ q_int8,
931
+ k_int8,
932
+ v_fp8,
933
+ o,
934
+ q_scale,
935
+ k_scale,
936
+ v_scale,
937
+ _tensor_layout,
938
+ _is_caual,
939
+ _qk_quant_gran,
940
+ sm_scale,
941
+ _return_lse,
942
+ )
943
+ elif pv_accum_dtype == "fp32+fp32":
958
+ lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf_sm90(
959
+ q_int8,
960
+ k_int8,
961
+ v_fp8,
962
+ o,
963
+ q_scale,
964
+ k_scale,
965
+ v_scale,
966
+ _tensor_layout,
967
+ _is_caual,
968
+ _qk_quant_gran,
969
+ sm_scale,
970
+ _return_lse,
971
+ )
972
+
973
+ o = o[..., :head_dim_og]
974
+
975
+ if return_lse:
976
+ return (
977
+ o,
978
+ lse / 1.44269504 + lse_correction * sm_scale
979
+ if smooth_k
980
+ else lse / 1.44269504,
981
+ )
982
+ else:
983
+ return o
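A direct-call sketch for the sm89/sm120 path defined above, bypassing the auto-dispatcher (hypothetical shapes; needs an FP8-capable GPU and the bundled extension):

```python
import torch
from sage_attention import sageattn_qk_int8_pv_fp8_cuda

b, h, seq, d = 1, 8, 2048, 128
q = torch.randn(b, seq, h, d, dtype=torch.bfloat16, device="cuda")   # "NHD" layout this time
k = torch.randn_like(q)
v = torch.randn_like(q)

o, lse = sageattn_qk_int8_pv_fp8_cuda(
    q, k, v,
    tensor_layout="NHD",
    is_causal=True,
    qk_quant_gran="per_thread",     # INT8 quantization granularity for Q/K
    pv_accum_dtype="fp32+fp16",     # the function default
    return_lse=True,
)
print(o.shape, lse.shape)           # [1, 2048, 8, 128] and [1, 8, 2048]
```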
build/torch27-cxx11-cu128-x86_64-linux/sage_attention/layers.py ADDED
File without changes
build/torch27-cxx11-cu128-x86_64-linux/sage_attention/quant.py ADDED
@@ -0,0 +1,326 @@
1
+ """
2
+ Copyright (c) 2024 by SageAttention team.
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ import torch
18
+ from typing import Optional
19
+
20
+ from ._ops import ops
21
+
22
+
23
+ def per_block_int8(
24
+ q: torch.Tensor,
25
+ k: torch.Tensor,
26
+ km: Optional[torch.Tensor] = None,
27
+ BLKQ: int = 128,
28
+ BLKK: int = 64,
29
+ sm_scale: Optional[float] = None,
30
+ tensor_layout: str = "HND",
31
+ ):
32
+ """
33
+ Quantize the query tensor `q` and the key tensor `k` with per block quantization.
34
+
35
+ Parameters
36
+ ----------
37
+ q : torch.Tensor
38
+ The query tensor. Shape:
39
+ - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``.
40
+ - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``.
41
+
42
+ k : torch.Tensor
43
+ The key tensor. Shape:
44
+ - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
45
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
46
+
47
+ km : Optional[torch.Tensor]
48
+ The mean tensor of `k` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]``.
49
+ Should be of the same dtype as `k` if provided. Default is None.
50
+
51
+ sm_scale : Optional[float]
52
+ The scale factor for the softmax operation. Default is ``head_dim**-0.5``.
53
+ It will be multiplied by ``1.44269504`` to work together with the triton attention kernel.
54
+
55
+ tensor_layout : str
56
+ The tensor layout, either "HND" or "NHD".
57
+ Default: "HND".
58
+
59
+ Returns
60
+ -------
61
+ Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]
62
+ A tuple containing:
63
+ - The quantized query tensor. Shape: Same as `q` but with `int8` dtype.
64
+ - The scale tensor of the query tensor. Shape: ``[batch_size, num_qo_heads, (qo_len + BLKQ - 1) // BLKQ]`` with `float32` dtype.
65
+ - The quantized key tensor. Shape: Same as `k` but with `int8` dtype.
66
+ - The scale tensor of the key tensor. Shape: ``[batch_size, num_kv_heads, (kv_len + BLKK - 1) // BLKK]`` with `float32` dtype.
67
+
68
+ Note
69
+ ----
70
+ - The tensors `q` and `k` must have the dtype ``torch.float16`` or ``torch.bfloat16``
71
+ """
72
+
73
+ q_int8 = torch.empty(q.shape, dtype=torch.int8, device=q.device)
74
+ k_int8 = torch.empty(k.shape, dtype=torch.int8, device=k.device)
75
+
76
+ if tensor_layout == "HND":
77
+ b, h_qo, qo_len, head_dim = q.shape
78
+ _, h_kv, kv_len, _ = k.shape
79
+
80
+ elif tensor_layout == "NHD":
81
+ b, qo_len, h_qo, head_dim = q.shape
82
+ _, kv_len, h_kv, _ = k.shape
83
+
84
+ else:
85
+ raise ValueError(f"Unknown tensor layout: {tensor_layout}")
86
+
87
+ _tensor_layout = 0 if tensor_layout == "NHD" else 1
88
+
89
+ q_scale = torch.empty(
90
+ (b, h_qo, (qo_len + BLKQ - 1) // BLKQ), device=q.device, dtype=torch.float32
91
+ )
92
+ k_scale = torch.empty(
93
+ (b, h_kv, (kv_len + BLKK - 1) // BLKK), device=q.device, dtype=torch.float32
94
+ )
95
+
96
+ if sm_scale is None:
97
+ sm_scale = head_dim**-0.5
98
+
99
+ sm_scale *= 1.44269504
100
+
101
+ ops.quant_per_block_int8_cuda(q, q_int8, q_scale, sm_scale, BLKQ, _tensor_layout)
102
+ if km is not None:
103
+ km = km.squeeze(1) if _tensor_layout == 0 else km.squeeze(2)
104
+ ops.quant_per_block_int8_fuse_sub_mean_cuda(
105
+ k, km, k_int8, k_scale, BLKK, _tensor_layout
106
+ )
107
+ else:
108
+ # The bound CUDA op expects an sm_scale argument; use 1.0 for K to avoid scaling
109
+ ops.quant_per_block_int8_cuda(k, k_int8, k_scale, 1.0, BLKK, _tensor_layout)
110
+
111
+ return q_int8, q_scale, k_int8, k_scale
112
+
113
+
114
+ def per_warp_int8(
115
+ q: torch.Tensor,
116
+ k: torch.Tensor,
117
+ km: Optional[torch.Tensor] = None,
118
+ BLKQ: int = 128,
119
+ WARPQ: int = 32,
120
+ BLKK: int = 64,
121
+ tensor_layout: str = "HND",
122
+ ):
123
+ """
124
+ Quantize the query tensor `q` with per warp quantization and the key tensor `k` with per block quantization.
125
+ Warp size of quantizing `q` is 16 or 32, with a block size of 64 or 128.
126
+ Block size of quantizing `k` is 64 or 128.
127
+
128
+ Parameters
129
+ ----------
130
+ q : torch.Tensor
131
+ The query tensor. Shape:
132
+ - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``.
133
+ - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``.
134
+
135
+ k : torch.Tensor
136
+ The key tensor. Shape:
137
+ - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
138
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
139
+
140
+ km : Optional[torch.Tensor]
141
+ The mean tensor of `k` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]``.
142
+ Should be of the same dtype as `k` if provided. Default is None.
143
+
144
+ tensor_layout : str
145
+ The tensor layout, either "HND" or "NHD".
146
+ Default: "HND".
147
+
148
+ Returns
149
+ -------
150
+ Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]
151
+ A tuple containing:
152
+ - The quantized query tensor. Shape: Same as `q` but with `int8` dtype.
153
+ - The scale tensor of the query tensor. Shape: ``[batch_size, num_qo_heads, (qo_len + BLKQ - 1) // BLKQ * (BLKQ // WARPQ)]`` with `float32` dtype.
154
+ - The quantized key tensor. Shape: Same as `k` but with `int8` dtype.
155
+ - The scale tensor of the key tensor. Shape: ``[batch_size, num_kv_heads, (kv_len + BLKK - 1) // BLKK]`` with `float32` dtype.
156
+
157
+ Note
158
+ ----
159
+ - The tensors `q` and `k` must have the dtype ``torch.float16`` or ``torch.bfloat16``
160
+ """
161
+
162
+ q_int8 = torch.empty(q.shape, dtype=torch.int8, device=q.device)
163
+ k_int8 = torch.empty(k.shape, dtype=torch.int8, device=k.device)
164
+
165
+ if tensor_layout == "HND":
166
+ b, h_qo, qo_len, head_dim = q.shape
167
+ _, h_kv, kv_len, _ = k.shape
168
+
169
+ elif tensor_layout == "NHD":
170
+ b, qo_len, h_qo, head_dim = q.shape
171
+ _, kv_len, h_kv, _ = k.shape
172
+
173
+ else:
174
+ raise ValueError(f"Unknown tensor layout: {tensor_layout}")
175
+
176
+ _tensor_layout = 0 if tensor_layout == "NHD" else 1
177
+
178
+ q_scale = torch.empty(
179
+ (b, h_qo, ((qo_len + BLKQ - 1) // BLKQ) * (BLKQ // WARPQ)),
180
+ device=q.device,
181
+ dtype=torch.float32,
182
+ )
183
+ k_scale = torch.empty(
184
+ (b, h_kv, (kv_len + BLKK - 1) // BLKK), device=q.device, dtype=torch.float32
185
+ )
186
+
187
+ ops.quant_per_warp_int8_cuda(q, q_int8, q_scale, BLKQ, WARPQ, _tensor_layout)
188
+
189
+ if km is not None:
190
+ km = km.squeeze(1) if _tensor_layout == 0 else km.squeeze(2)
191
+ ops.quant_per_block_int8_fuse_sub_mean_cuda(
192
+ k, km, k_int8, k_scale, BLKK, _tensor_layout
193
+ )
194
+ else:
195
+ # The bound CUDA op expects an sm_scale argument; use 1.0 for K to avoid scaling
196
+ ops.quant_per_block_int8_cuda(k, k_int8, k_scale, 1.0, BLKK, _tensor_layout)
197
+
198
+ return q_int8, q_scale, k_int8, k_scale
199
+
200
+
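To make the scale layouts above concrete, a small shape sketch contrasting `per_block_int8` and `per_warp_int8` (hypothetical sizes; both require the CUDA extension):

```python
import torch
from sage_attention.quant import per_block_int8, per_warp_int8

b, h, seq, d = 1, 4, 4096, 128
q = torch.randn(b, h, seq, d, dtype=torch.float16, device="cuda")    # "HND" layout
k = torch.randn_like(q)

_, q_scale_blk, _, k_scale_blk = per_block_int8(q, k, BLKQ=128, BLKK=64)
_, q_scale_wrp, _, k_scale_wrp = per_warp_int8(q, k, BLKQ=128, WARPQ=32, BLKK=64)

print(q_scale_blk.shape)  # [1, 4, 32]:  one scale per 128-row Q block
print(q_scale_wrp.shape)  # [1, 4, 128]: one scale per 32-row warp slice (4 per block)
print(k_scale_blk.shape)  # [1, 4, 64]:  K is quantized per 64-row block in both cases
print(k_scale_wrp.shape)  # [1, 4, 64]
```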
201
+ def sub_mean(v: torch.Tensor, tensor_layout: str = "HND"):
202
+ """
203
+ Calculate the mean of the tensor `v` along the sequence length dimension and subtract it from `v`. Result is stored as fp16.
204
+
205
+ Parameters
206
+ ----------
207
+ v : torch.Tensor
208
+ The input tensor. Shape:
209
+ - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
210
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
211
+
212
+ tensor_layout : str
213
+ The tensor layout, either "HND" or "NHD".
214
+ Default: "HND".
215
+
216
+ Returns
217
+ -------
218
+ Tuple[torch.Tensor, torch.Tensor]
219
+ A tuple containing:
220
+ - The tensor `v_smoothed` with the mean subtracted and stored as fp16. Shape: Same as `v` with `float16` dtype.
221
+ - The mean tensor of `v` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]`` with dtype same as `v`.
222
+
223
+ Note
224
+ ----
225
+ - The tensor `v` must have the dtype ``torch.float16`` or ``torch.bfloat16``.
226
+ - The returned tensor `v_smoothed` will have dtype ``torch.float16`` regardless of the input dtype.
227
+ - The returned mean tensor will have the same dtype as the input tensor.
228
+ """
229
+
230
+ _tensor_layout = 0 if tensor_layout == "NHD" else 1
231
+ vm = v.mean(dim=1 if _tensor_layout == 0 else 2)
232
+
233
+ v_smoothed = torch.empty(v.shape, dtype=torch.float16, device=v.device)
234
+
235
+ # subtract mean and store the result as fp16
236
+ ops.sub_mean_cuda(v, vm, v_smoothed, _tensor_layout)
237
+
238
+ return v_smoothed, vm
239
+
240
+
241
+ def per_channel_fp8(
242
+ v: torch.Tensor,
243
+ tensor_layout: str = "HND",
244
+ scale_max: float = 448.0,
245
+ smooth_v: bool = True,
246
+ ):
247
+ """
248
+ Transpose, pad and permute the tensor `v` and quantize it to fp8 with per channel quantization.
249
+ `v` is first transposed along the head dimension and the sequence length dimension, then padded to a multiple of 64.
250
+ After that, the tensor is permuted along the sequence length dimension by ``[0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15]``.
251
+ The quantization is done per channel, with the scale value and smooth factor calculated per channel.
252
+
253
+ Parameters
254
+ ----------
255
+ v : torch.Tensor
256
+ The input tensor. Shape:
257
+ - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
258
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
259
+
260
+ tensor_layout : str
261
+ The tensor layout, either "HND" or "NHD".
262
+ Default: "HND".
263
+
264
+ scale_max : float
265
+ The maximum scale value for the quantization. Default is 448.0 (upper bound of E4M3 data format).
266
+
267
+ smooth_v : bool
268
+ Whether to smooth the quantized tensor. Default is True.
269
+
270
+ Returns
271
+ -------
272
+ Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]
273
+ A tuple containing:
274
+ - The quantized tensor `v_fp8`. Shape:
275
+ - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, head_dim, (kv_len + 63) // 64 * 64]``, with `float8_e4m3fn` dtype.
276
+ - If `tensor_layout` is "NHD": ``[batch_size, head_dim, num_kv_heads, (kv_len + 63) // 64 * 64]``, with `float8_e4m3fn` dtype.
277
+ - The scale tensor of `v`. Shape: ``[batch_size, num_kv_heads, head_dim]`` with `float32` dtype.
278
+ - The mean tensor of `v` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]`` with `float32` dtype.
279
+
280
+ Note
281
+ ----
282
+ - The tensor `v` must have the dtype ``torch.float16`` or ``torch.bfloat16``
283
+ - The returned mean tensor will be None if `smooth_v` is False. Otherwise it will have dtype ``torch.float32``.
284
+ """
285
+
286
+ _tensor_layout = 0 if tensor_layout == "NHD" else 1
287
+
288
+ if tensor_layout == "HND":
289
+ b, h_kv, kv_len, head_dim = v.shape
290
+ padded_len = (kv_len + 63) // 64 * 64
291
+ v_transposed_permutted = torch.empty(
292
+ (b, h_kv, head_dim, padded_len), dtype=v.dtype, device=v.device
293
+ )
294
+
295
+ elif tensor_layout == "NHD":
296
+ b, kv_len, h_kv, head_dim = v.shape
297
+ padded_len = (kv_len + 63) // 64 * 64
298
+ v_transposed_permutted = torch.empty(
299
+ (b, head_dim, h_kv, padded_len), dtype=v.dtype, device=v.device
300
+ )
301
+
302
+ ops.transpose_pad_permute_cuda(v, v_transposed_permutted, _tensor_layout)
303
+
304
+ v_fp8 = torch.empty(
305
+ v_transposed_permutted.shape, dtype=torch.float8_e4m3fn, device=v.device
306
+ )
307
+
308
+ v_scale = torch.empty((b, h_kv, head_dim), dtype=torch.float32, device=v.device)
309
+ vm = torch.empty((b, h_kv, head_dim), dtype=torch.float32, device=v.device)
310
+
311
+ if smooth_v:
312
+ ops.mean_scale_fuse_quant_cuda(
313
+ v_transposed_permutted,
314
+ v_fp8,
315
+ vm,
316
+ v_scale,
317
+ kv_len,
318
+ scale_max,
319
+ _tensor_layout,
320
+ )
321
+ return v_fp8, v_scale, vm
322
+ else:
323
+ ops.scale_fuse_quant_cuda(
324
+ v_transposed_permutted, v_fp8, v_scale, kv_len, scale_max, _tensor_layout
325
+ )
326
+ return v_fp8, v_scale, None
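The two quantization paths in this file (per-warp/per-block INT8 for Q and K, per-channel FP8 for V) are meant to be called on the raw attention inputs before the fused kernel. A minimal usage sketch follows; the `sage_attention` import path and the shapes are assumptions based on this build's package layout, and a CUDA device is required.

import torch
from sage_attention import per_warp_int8, per_channel_fp8  # assumed import path

b, h, seq, d = 1, 8, 1024, 128
q = torch.randn(b, h, seq, d, dtype=torch.float16, device="cuda")
k = torch.randn(b, h, seq, d, dtype=torch.float16, device="cuda")
v = torch.randn(b, h, seq, d, dtype=torch.float16, device="cuda")

# Smooth K with its per-head mean ("HND": sequence dim is 2), then quantize
# Q per warp and K per block.
km = k.mean(dim=2, keepdim=True)
q_int8, q_scale, k_int8, k_scale = per_warp_int8(q, k, km, tensor_layout="HND")

# Quantize V per channel to FP8 (E4M3); vm is the per-channel mean when smooth_v=True.
v_fp8, v_scale, vm = per_channel_fp8(v, tensor_layout="HND", smooth_v=True)

print(q_scale.shape)                # [1, 8, (1024 // 128) * (128 // 32)] == [1, 8, 32]
print(v_fp8.dtype, v_scale.shape)   # torch.float8_e4m3fn, [1, 8, 128]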
build/torch27-cxx11-cu128-x86_64-linux/sage_attention/quant_per_thread.py ADDED
@@ -0,0 +1,204 @@
1
+ """
2
+ Copyright (c) 2024 by SageAttention team.
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ import torch
18
+ import triton
19
+ import triton.language as tl
20
+
21
+ @triton.jit
22
+ def quant_query_per_thread_int8_kernel(Input, Output, Scale, L,
23
+ stride_iz, stride_ih, stride_in,
24
+ stride_oz, stride_oh, stride_on,
25
+ stride_sz, stride_sh,
26
+ C: tl.constexpr, BLK: tl.constexpr):
27
+ off_blk = tl.program_id(0) // 8
28
+ off_tld = tl.program_id(0) % 8
29
+ off_h = tl.program_id(1)
30
+ off_b = tl.program_id(2)
31
+
32
+ offs_n = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld
33
+ offs_k = tl.arange(0, C)
34
+
35
+ input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :]
36
+ output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :]
37
+ scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 8 + off_tld
38
+
39
+ x = tl.load(input_ptrs, mask=offs_n[:, None] < L)
40
+ x = x.to(tl.float32)
41
+ scale = tl.max(tl.abs(x)) / 127. + 0.0000001
42
+ x_int8 = x / scale
43
+ x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1)
44
+ x_int8 = x_int8.to(tl.int8)
45
+ tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L)
46
+ tl.store(scale_ptrs, scale)
47
+
48
+ @triton.jit
49
+ def quant_key_per_thread_int8_kernel(Input, Output, Scale, L,
50
+ stride_iz, stride_ih, stride_in,
51
+ stride_oz, stride_oh, stride_on,
52
+ stride_sz, stride_sh,
53
+ C: tl.constexpr, BLK: tl.constexpr):
54
+ off_blk = tl.program_id(0) // 4
55
+ off_tld = tl.program_id(0) % 4
56
+ off_h = tl.program_id(1)
57
+ off_b = tl.program_id(2)
58
+
59
+ # offs_n = off_blk * BLK + tl.cat(tl.arange(0, BLK // 8) * 8, tl.arange(0, BLK // 8) * 8 + 1, True) + off_tld * 2
60
+ # offs_k = tl.arange(0, C)
61
+
62
+ # input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :]
63
+ # output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :]
64
+ # scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 4 + off_tld
65
+
66
+ # x = tl.load(input_ptrs, mask=offs_n[:, None] < L)
67
+ # x = x.to(tl.float32)
68
+ # scale = tl.max(tl.abs(x)) / 127. + 0.0000001
69
+ # x_int8 = x / scale
70
+ # x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1)
71
+ # x_int8 = x_int8.to(tl.int8)
72
+ # tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L)
73
+ # tl.store(scale_ptrs, scale)
74
+
75
+ offs_n0 = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld * 2
76
+ offs_n1 = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld * 2 + 1
77
+ offs_k = tl.arange(0, C)
78
+
79
+ input_ptrs0 = Input + off_b * stride_iz + off_h * stride_ih + offs_n0[:, None] * stride_in + offs_k[None, :]
80
+ input_ptrs1 = Input + off_b * stride_iz + off_h * stride_ih + offs_n1[:, None] * stride_in + offs_k[None, :]
81
+ output_ptrs0 = Output + off_b * stride_oz + off_h * stride_oh + offs_n0[:, None] * stride_on + offs_k[None, :]
82
+ output_ptrs1 = Output + off_b * stride_oz + off_h * stride_oh + offs_n1[:, None] * stride_on + offs_k[None, :]
83
+ scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 4 + off_tld
84
+
85
+ x0 = tl.load(input_ptrs0, mask=offs_n0[:, None] < L)
86
+ x1 = tl.load(input_ptrs1, mask=offs_n1[:, None] < L)
87
+ x0 = x0.to(tl.float32)
88
+ x1 = x1.to(tl.float32)
89
+ scale = max(tl.max(tl.abs(x0)), tl.max(tl.abs(x1))) / 127. + 0.0000001
90
+ x0_int8 = x0 / scale
91
+ x1_int8 = x1 / scale
92
+ x0_int8 += 0.5 * tl.where(x0_int8 >= 0, 1, -1)
93
+ x1_int8 += 0.5 * tl.where(x1_int8 >= 0, 1, -1)
94
+ x0_int8 = x0_int8.to(tl.int8)
95
+ x1_int8 = x1_int8.to(tl.int8)
96
+ tl.store(output_ptrs0, x0_int8, mask=offs_n0[:, None] < L)
97
+ tl.store(output_ptrs1, x1_int8, mask=offs_n1[:, None] < L)
98
+ tl.store(scale_ptrs, scale)
99
+
100
+ @triton.jit
101
+ def quant_query_per_thread_int4_kernel(Input, Output, Scale, L,
102
+ stride_iz, stride_ih, stride_in,
103
+ stride_oz, stride_oh, stride_on,
104
+ stride_sz, stride_sh,
105
+ C: tl.constexpr, BLK: tl.constexpr):
106
+ off_blk = tl.program_id(0) // 8
107
+ off_tld = tl.program_id(0) % 8
108
+ off_h = tl.program_id(1)
109
+ off_b = tl.program_id(2)
110
+
111
+ offs_n = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld
112
+ offs_k = tl.arange(0, C)
113
+
114
+ input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :]
115
+ output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :]
116
+ scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 8 + off_tld
117
+
118
+ x = tl.load(input_ptrs, mask=offs_n[:, None] < L)
119
+ x = x.to(tl.float32)
120
+ scale = tl.max(tl.abs(x)) / 7. + 0.0000001
121
+ x_int8 = x / scale
122
+ x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1)
123
+ x_int8 = x_int8.to(tl.int8)
124
+ tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L)
125
+ tl.store(scale_ptrs, scale)
126
+
127
+ @triton.jit
128
+ def quant_key_per_thread_int4_kernel(Input, Output, Scale, L,
129
+ stride_iz, stride_ih, stride_in,
130
+ stride_oz, stride_oh, stride_on,
131
+ stride_sz, stride_sh,
132
+ C: tl.constexpr, BLK: tl.constexpr):
133
+ off_blk = tl.program_id(0) // 4
134
+ off_tld = tl.program_id(0) % 4
135
+ off_h = tl.program_id(1)
136
+ off_b = tl.program_id(2)
137
+
138
+ offs_n = off_blk * BLK + tl.cat(tl.arange(0, BLK // 8) * 8, tl.arange(0, BLK // 8) * 8 + 1, True) + off_tld * 2
139
+ offs_k = tl.arange(0, C)
140
+
141
+ input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :]
142
+ output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :]
143
+ scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 4 + off_tld
144
+
145
+ x = tl.load(input_ptrs, mask=offs_n[:, None] < L)
146
+ x = x.to(tl.float32)
147
+ scale = tl.max(tl.abs(x)) / 7. + 0.0000001
148
+ x_int8 = x / scale
149
+ x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1)
150
+ x_int8 = x_int8.to(tl.int8)
151
+ tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L)
152
+ tl.store(scale_ptrs, scale)
153
+
154
+ def per_thread_int8(q, k, km=None, BLKQ=128, WARPQ=32, BLKK=64, WARPK=64, sm_scale=None, tensor_layout="HND"):
155
+ q_int8 = torch.empty(q.shape, dtype=torch.int8, device=q.device)
156
+ k_int8 = torch.empty(k.shape, dtype=torch.int8, device=k.device)
157
+
158
+ if km is not None:
159
+ k = k - km
160
+
161
+ if tensor_layout == "HND":
162
+ b, h_qo, qo_len, head_dim = q.shape
163
+ _, h_kv, kv_len, _ = k.shape
164
+
165
+ stride_bz_q, stride_h_q, stride_seq_q = q.stride(0), q.stride(1), q.stride(2)
166
+ stride_bz_qo, stride_h_qo, stride_seq_qo = q_int8.stride(0), q_int8.stride(1), q_int8.stride(2)
167
+ stride_bz_k, stride_h_k, stride_seq_k = k.stride(0), k.stride(1), k.stride(2)
168
+ stride_bz_ko, stride_h_ko, stride_seq_ko = k_int8.stride(0), k_int8.stride(1), k_int8.stride(2)
169
+ elif tensor_layout == "NHD":
170
+ b, qo_len, h_qo, head_dim = q.shape
171
+ _, kv_len, h_kv, _ = k.shape
172
+
173
+ stride_bz_q, stride_h_q, stride_seq_q = q.stride(0), q.stride(2), q.stride(1)
174
+ stride_bz_qo, stride_h_qo, stride_seq_qo = q_int8.stride(0), q_int8.stride(2), q_int8.stride(1)
175
+ stride_bz_k, stride_h_k, stride_seq_k = k.stride(0), k.stride(2), k.stride(1)
176
+ stride_bz_ko, stride_h_ko, stride_seq_ko = k_int8.stride(0), k_int8.stride(2), k_int8.stride(1)
177
+ else:
178
+ raise ValueError(f"Unknown tensor layout: {tensor_layout}")
179
+
180
+ q_scale = torch.empty((b, h_qo, (qo_len + BLKQ - 1) // BLKQ * (BLKQ // WARPQ) * 8), device=q.device, dtype=torch.float32)
181
+ k_scale = torch.empty((b, h_kv, (kv_len + BLKK - 1) // BLKK * (BLKK // WARPK) * 4), device=q.device, dtype=torch.float32)
182
+
183
+ if sm_scale is None:
184
+ sm_scale = head_dim**-0.5
185
+
186
+ grid = ((qo_len + BLKQ - 1) // BLKQ * (BLKQ // WARPQ) * 8, h_qo, b)
187
+ quant_query_per_thread_int8_kernel[grid](
188
+ q, q_int8, q_scale, qo_len,
189
+ stride_bz_q, stride_h_q, stride_seq_q,
190
+ stride_bz_qo, stride_h_qo, stride_seq_qo,
191
+ q_scale.stride(0), q_scale.stride(1),
192
+ C=head_dim, BLK=WARPQ
193
+ )
194
+
195
+ grid = ((kv_len + BLKK - 1) // BLKK * (BLKK // WARPK) * 4, h_kv, b)
196
+ quant_key_per_thread_int8_kernel[grid](
197
+ k, k_int8, k_scale, kv_len,
198
+ stride_bz_k, stride_h_k, stride_seq_k,
199
+ stride_bz_ko, stride_h_ko, stride_seq_ko,
200
+ k_scale.stride(0), k_scale.stride(1),
201
+ C=head_dim, BLK=WARPK
202
+ )
203
+
204
+ return q_int8, q_scale, k_int8, k_scale
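For reference, a minimal sketch of calling the Triton per-thread quantizer above (assumed `sage_attention.quant_per_thread` import path; requires CUDA and Triton). It mainly illustrates the scale layout: 8 scales per WARPQ query tile and 4 per WARPK key tile, matching the kernel grids.

import torch
from sage_attention.quant_per_thread import per_thread_int8  # assumed path

b, h, seq, d = 1, 4, 512, 128
q = torch.randn(b, h, seq, d, dtype=torch.float16, device="cuda")
k = torch.randn(b, h, seq, d, dtype=torch.float16, device="cuda")

q_int8, q_scale, k_int8, k_scale = per_thread_int8(
    q, k, BLKQ=128, WARPQ=32, BLKK=64, WARPK=64, tensor_layout="HND"
)

# (512 // 128) * (128 // 32) * 8 = 128 query scales per head,
# (512 // 64) * (64 // 64) * 4  = 32 key scales per head.
assert q_scale.shape == (b, h, 128)
assert k_scale.shape == (b, h, 32)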
build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__init__.py ADDED
@@ -0,0 +1,12 @@
1
+ from .quant import per_block_int8, per_warp_int8, sub_mean, per_channel_fp8
2
+ from .core import sageattn, sageattn_qk_int8_pv_fp8_cuda
3
+
4
+
5
+ __all__ = [
6
+ "per_block_int8",
7
+ "per_warp_int8",
8
+ "sub_mean",
9
+ "per_channel_fp8",
10
+ "sageattn",
11
+ "sageattn_qk_int8_pv_fp8_cuda",
12
+ ]
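The package surface re-exported here is intentionally small. A quick check of what downstream code is expected to import (assuming the wheel is installed as `sage_attention`):

import sage_attention

# The six public entry points exposed by this __init__:
print(sage_attention.__all__)
# ['per_block_int8', 'per_warp_int8', 'sub_mean', 'per_channel_fp8',
#  'sageattn', 'sageattn_qk_int8_pv_fp8_cuda']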
build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (433 Bytes).
 
build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc ADDED
Binary file (550 Bytes).
 
build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc ADDED
Binary file (33.4 kB).
 
build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc ADDED
Binary file (13.4 kB).
 
build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc ADDED
Binary file (13 kB).
 
build/torch28-cxx11-cu126-x86_64-linux/sage_attention/_ops.py ADDED
@@ -0,0 +1,9 @@
1
+ import torch
2
+ from . import _sage_attention_44b112f_dirty
3
+ ops = torch.ops._sage_attention_44b112f_dirty
4
+
5
+ def add_op_namespace_prefix(op_name: str):
6
+ """
7
+ Prefix op by namespace.
8
+ """
9
+ return f"_sage_attention_44b112f_dirty::{op_name}"
build/torch28-cxx11-cu126-x86_64-linux/sage_attention/_sage_attention_44b112f_dirty.abi3.so ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28e181de0c6388653fb4b8b2d7347f1f547fc84fe7dc45bc66db9b1431d141bc
3
+ size 26037392
build/torch28-cxx11-cu126-x86_64-linux/sage_attention/core.py ADDED
@@ -0,0 +1,983 @@
1
+ """
2
+ Copyright (c) 2024 by SageAttention team.
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ import torch
18
+ import torch.nn.functional as F
19
+
20
+ from ._ops import ops
21
+
22
+
23
+ from .quant import per_warp_int8 as per_warp_int8_cuda
24
+ from .quant import sub_mean
25
+ from .quant import per_channel_fp8
26
+ from .quant_per_thread import per_thread_int8 as per_thread_int8_triton
27
+
28
+ from typing import Any, List, Literal, Optional, Tuple, Union
29
+ import warnings
30
+
31
+
32
+ import subprocess
33
+ import re
34
+
35
+
36
+ def get_cuda_version():
37
+ try:
38
+ output = subprocess.check_output(["nvcc", "--version"]).decode()
39
+ match = re.search(r"release (\d+)\.(\d+)", output)
40
+ if match:
41
+ major, minor = int(match.group(1)), int(match.group(2))
42
+ return major, minor
43
+ except Exception as e:
44
+ print("Failed to get CUDA version:", e)
45
+ return None, None
46
+
47
+
48
+ def get_cuda_arch_versions():
49
+ cuda_archs = []
50
+ for i in range(torch.cuda.device_count()):
51
+ major, minor = torch.cuda.get_device_capability(i)
52
+ cuda_archs.append(f"sm{major}{minor}")
53
+ return cuda_archs
54
+
55
+
56
+ def sageattn(
57
+ q: torch.Tensor,
58
+ k: torch.Tensor,
59
+ v: torch.Tensor,
60
+ tensor_layout: str = "HND",
61
+ is_causal: bool = False,
62
+ sm_scale: Optional[float] = None,
63
+ return_lse: bool = False,
64
+ **kwargs: Any,
65
+ ):
66
+ """
67
+ Automatically selects the appropriate implementation of the SageAttention kernel based on the GPU compute capability.
68
+
69
+ Parameters
70
+ ----------
71
+ q : torch.Tensor
72
+ The query tensor. Shape:
73
+ - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``.
74
+ - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``.
75
+
76
+ k : torch.Tensor
77
+ The key tensor. Shape:
78
+ - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
79
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
80
+
81
+ v : torch.Tensor
82
+ The value tensor. Shape:
83
+ - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
84
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
85
+
86
+ tensor_layout : str
87
+ The tensor layout, either "HND" or "NHD".
88
+ Default: "HND".
89
+
90
+ is_causal : bool
91
+ Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len.
92
+ Default: False.
93
+
94
+ sm_scale : Optional[float]
95
+ The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``.
96
+
97
+ return_lse : bool
98
+ Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention.
99
+ Default: False.
100
+
101
+ Returns
102
+ -------
103
+ torch.Tensor
104
+ The output tensor. Shape:
105
+ - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``.
106
+ - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``.
107
+
108
+ torch.Tensor
109
+ The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor).
110
+ Shape: ``[batch_size, num_qo_heads, qo_len]``.
111
+ Only returned if `return_lse` is True.
112
+
113
+ Note
114
+ ----
115
+ - ``num_qo_heads`` must be divisible by ``num_kv_heads``.
116
+ - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16``
117
+ - All tensors must be on the same cuda device.
118
+ """
119
+
120
+ arch = get_cuda_arch_versions()[q.device.index]
121
+ if arch == "sm80":
122
+ return sageattn_qk_int8_pv_fp16_cuda(
123
+ q,
124
+ k,
125
+ v,
126
+ tensor_layout=tensor_layout,
127
+ is_causal=is_causal,
128
+ sm_scale=sm_scale,
129
+ return_lse=return_lse,
130
+ pv_accum_dtype="fp32",
131
+ )
132
+ elif arch == "sm89":
133
+ return sageattn_qk_int8_pv_fp8_cuda(
134
+ q,
135
+ k,
136
+ v,
137
+ tensor_layout=tensor_layout,
138
+ is_causal=is_causal,
139
+ sm_scale=sm_scale,
140
+ return_lse=return_lse,
141
+ pv_accum_dtype="fp32+fp16",
142
+ )
143
+ elif arch == "sm90":
144
+ return sageattn_qk_int8_pv_fp8_cuda_sm90(
145
+ q,
146
+ k,
147
+ v,
148
+ tensor_layout=tensor_layout,
149
+ is_causal=is_causal,
150
+ sm_scale=sm_scale,
151
+ return_lse=return_lse,
152
+ pv_accum_dtype="fp32+fp32",
153
+ )
154
+ elif arch == "sm120":
155
+ return sageattn_qk_int8_pv_fp8_cuda(
156
+ q,
157
+ k,
158
+ v,
159
+ tensor_layout=tensor_layout,
160
+ is_causal=is_causal,
161
+ qk_quant_gran="per_warp",
162
+ sm_scale=sm_scale,
163
+ return_lse=return_lse,
164
+ pv_accum_dtype="fp32+fp16",
165
+ )  # sm120 has an accurate FP32 accumulator for FP8 MMA, and the Triton kernel is currently not usable on sm120.
166
+ else:
167
+ raise ValueError(f"Unsupported CUDA architecture: {arch}")
168
+
169
+
170
+ @torch.compiler.disable
171
+ def sageattn_qk_int8_pv_fp16_cuda(
172
+ q: torch.Tensor,
173
+ k: torch.Tensor,
174
+ v: torch.Tensor,
175
+ tensor_layout: str = "HND",
176
+ is_causal: bool = False,
177
+ qk_quant_gran: str = "per_thread",
178
+ sm_scale: Optional[float] = None,
179
+ pv_accum_dtype: str = "fp32",
180
+ smooth_k: bool = True,
181
+ smooth_v: bool = False,
182
+ return_lse: bool = False,
183
+ **kwargs: Any,
184
+ ) -> torch.Tensor:
185
+ """
186
+ SageAttention with INT8 quantization for Q and K, FP16 PV with FP16/FP32 accumulation, implemented using CUDA.
187
+
188
+ Parameters
189
+ ----------
190
+ q : torch.Tensor
191
+ The query tensor. Shape:
192
+ - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``.
193
+ - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``.
194
+
195
+ k : torch.Tensor
196
+ The key tensor. Shape:
197
+ - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
198
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
199
+
200
+ v : torch.Tensor
201
+ The value tensor. Shape:
202
+ - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
203
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
204
+
205
+ tensor_layout : str
206
+ The tensor layout, either "HND" or "NHD".
207
+ Default: "HND".
208
+
209
+ is_causal : bool
210
+ Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len.
211
+ Default: False.
212
+
213
+ qk_quant_gran : str
214
+ The granularity of quantization for Q and K, either "per_warp" or "per_thread".
215
+ Default: "per_thread".
216
+
217
+ sm_scale : Optional[float]
218
+ The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``.
219
+
220
+ pv_accum_dtype : str
221
+ The dtype of the accumulation of the product of the value tensor and the attention weights, either "fp16", "fp16+fp32" or "fp32".
222
+ - "fp16": PV accumulation is done in fully in FP16. This is the fastest option but may lead to numerical instability. `smooth_v` option will increase the accuracy in cases when the value tensor has a large bias (like in CogVideoX-2b).
223
+ - "fp32": PV accumulation is done in FP32. This is the most accurate option but may be slower than "fp16" due to CUDA core overhead.
224
+ - "fp16+fp32": PV accumulation is done in FP16, but added to a FP32 buffer every few iterations. This offers a balance between speed and accuracy.
225
+ Default: "fp32".
226
+
227
+ smooth_k : bool
228
+ Whether to smooth the key tensor by subtracting the mean along the sequence dimension.
229
+ Default: True.
230
+
231
+ smooth_v : bool
232
+ Whether to smooth the value tensor by subtracting the mean along the sequence dimension.
233
+ smooth_v will be ignored if pv_accum_dtype is "fp32" or "fp16+fp32".
234
+ Default: False.
235
+
236
+ return_lse : bool
237
+ Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention.
238
+ Default: False.
239
+
240
+ Returns
241
+ -------
242
+ torch.Tensor
243
+ The output tensor. Shape:
244
+ - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``.
245
+ - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``.
246
+
247
+ torch.Tensor
248
+ The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor).
249
+ Shape: ``[batch_size, num_qo_heads, qo_len]``.
250
+ Only returned if `return_lse` is True.
251
+
252
+ Note
253
+ ----
254
+ - ``num_qo_heads`` must be divisible by ``num_kv_heads``.
255
+ - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16``
256
+ - All tensors must be on the same cuda device.
257
+ - `smooth_k` will introduce slight overhead but will improve the accuracy under most circumstances.
258
+ """
259
+
260
+ dtype = q.dtype
261
+ assert q.is_cuda, "Input tensors must be on cuda."
262
+ assert dtype in [torch.float16, torch.bfloat16], (
263
+ "Input tensors must be in dtype of torch.float16 or torch.bfloat16"
264
+ )
265
+ assert qk_quant_gran in ["per_warp", "per_thread"], (
266
+ "qk_quant_gran must be either 'per_warp' or 'per_thread'."
267
+ )
268
+ assert q.device == k.device == v.device, "All tensors must be on the same device."
269
+ assert q.dtype == k.dtype == v.dtype, "All tensors must have the same dtype."
270
+
271
+ # FIXME(DefTruth): make sage attention work compatible with distributed
272
+ # env, for example xDiT, which launches via torchrun. Without this workaround,
273
+ # sage attention will run into illegal memory access error after first
274
+ # inference step in distributed env for multi gpus inference. This small
275
+ # workaround also makes sage attention work with torch.compile
276
+ # through non-fullgraph compile mode.
277
+ torch.cuda.set_device(v.device)
278
+
279
+ _tensor_layout = 0 if tensor_layout == "NHD" else 1
280
+ _is_caual = 1 if is_causal else 0
281
+ _qk_quant_gran = 3 if qk_quant_gran == "per_thread" else 2
282
+ _return_lse = 1 if return_lse else 0
283
+
284
+ head_dim_og = q.size(-1)
285
+
286
+ if head_dim_og < 64:
287
+ q = torch.nn.functional.pad(q, (0, 64 - head_dim_og))
288
+ k = torch.nn.functional.pad(k, (0, 64 - head_dim_og))
289
+ v = torch.nn.functional.pad(v, (0, 64 - head_dim_og))
290
+ elif head_dim_og > 64 and head_dim_og < 128:
291
+ q = torch.nn.functional.pad(q, (0, 128 - head_dim_og))
292
+ k = torch.nn.functional.pad(k, (0, 128 - head_dim_og))
293
+ v = torch.nn.functional.pad(v, (0, 128 - head_dim_og))
294
+ elif head_dim_og > 128:
295
+ raise ValueError(f"Unsupported head_dim: {head_dim_og}")
296
+
297
+ # assert last dim is contiguous
298
+ assert q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1, (
299
+ "Last dim of qkv must be contiguous."
300
+ )
301
+
302
+ if sm_scale is None:
303
+ sm_scale = head_dim_og**-0.5
304
+
305
+ seq_dim = 1 if _tensor_layout == 0 else 2
306
+ nh_dim = 2 if _tensor_layout == 0 else 1
307
+
308
+ if smooth_k:
309
+ km = k.mean(dim=seq_dim, keepdim=True)
310
+ nqheads = q.size(2)
311
+ nkheads = k.size(2)
312
+ q_per_kv_heads = nqheads // nkheads
313
+ if q_per_kv_heads > 1:
314
+ # nheads_k => nheads_q
315
+ km_broadcast = torch.repeat_interleave(km, q_per_kv_heads, dim=nh_dim)
316
+ else:
317
+ km_broadcast = km
318
+ if return_lse:
319
+ if tensor_layout == "NHD":
320
+ lse_correction = (
321
+ torch.matmul(
322
+ q.transpose(1, 2), km_broadcast.transpose(1, 2).transpose(2, 3)
323
+ )
324
+ .squeeze(-1)
325
+ .to(torch.float32)
326
+ )
327
+ else:
328
+ lse_correction = (
329
+ torch.matmul(q, km_broadcast.transpose(2, 3))
330
+ .squeeze(-1)
331
+ .to(torch.float32)
332
+ )
333
+ else:
334
+ km = None
335
+
336
+ if qk_quant_gran == "per_warp":
337
+ q_int8, q_scale, k_int8, k_scale = per_warp_int8_cuda(
338
+ q,
339
+ k,
340
+ km,
341
+ tensor_layout=tensor_layout,
342
+ BLKQ=128,
343
+ WARPQ=(16 if (q.size(-1) == 128 and pv_accum_dtype == "fp16+fp32") else 32),
344
+ BLKK=64,
345
+ )
346
+ elif qk_quant_gran == "per_thread":
347
+ q_int8, q_scale, k_int8, k_scale = per_thread_int8_triton(
348
+ q,
349
+ k,
350
+ km,
351
+ tensor_layout=tensor_layout,
352
+ BLKQ=128,
353
+ WARPQ=(16 if (q.size(-1) == 128 and pv_accum_dtype == "fp16+fp32") else 32),
354
+ BLKK=64,
355
+ WARPK=64,
356
+ )
357
+
358
+ o = torch.empty(q.size(), dtype=dtype, device=q.device)
359
+
360
+ if pv_accum_dtype in ["fp32", "fp16+fp32"] and smooth_v:
361
+ warnings.warn(f"pv_accum_dtype is {pv_accum_dtype}, smooth_v will be ignored.")
362
+ smooth_v = False
363
+
364
+ if pv_accum_dtype == "fp32":
365
+ v = v.to(torch.float16)
366
+ lse = _qattn_sm80.qk_int8_sv_f16_accum_f32_attn(
367
+ q_int8,
368
+ k_int8,
369
+ v,
370
+ o,
371
+ q_scale,
372
+ k_scale,
373
+ _tensor_layout,
374
+ _is_caual,
375
+ _qk_quant_gran,
376
+ sm_scale,
377
+ _return_lse,
378
+ )
379
+ elif pv_accum_dtype == "fp16":
380
+ if smooth_v:
381
+ smoothed_v, vm = sub_mean(v, tensor_layout=tensor_layout)
382
+ lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_fuse_v_mean_attn(
383
+ q_int8,
384
+ k_int8,
385
+ smoothed_v,
386
+ o,
387
+ q_scale,
388
+ k_scale,
389
+ vm,
390
+ _tensor_layout,
391
+ _is_caual,
392
+ _qk_quant_gran,
393
+ sm_scale,
394
+ _return_lse,
395
+ )
396
+ else:
397
+ v = v.to(torch.float16)
398
+ lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_attn(
399
+ q_int8,
400
+ k_int8,
401
+ v,
402
+ o,
403
+ q_scale,
404
+ k_scale,
405
+ _tensor_layout,
406
+ _is_caual,
407
+ _qk_quant_gran,
408
+ sm_scale,
409
+ _return_lse,
410
+ )
411
+ elif pv_accum_dtype == "fp16+fp32":
412
+ v = v.to(torch.float16)
413
+ lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_attn_inst_buf(
414
+ q_int8,
415
+ k_int8,
416
+ v,
417
+ o,
418
+ q_scale,
419
+ k_scale,
420
+ _tensor_layout,
421
+ _is_caual,
422
+ _qk_quant_gran,
423
+ sm_scale,
424
+ _return_lse,
425
+ )
426
+ else:
427
+ raise ValueError(f"Unsupported pv_accum_dtype: {pv_accum_dtype}")
428
+
429
+ o = o[..., :head_dim_og]
430
+
431
+ if return_lse:
432
+ return (
433
+ o,
434
+ lse / 1.44269504 + lse_correction * sm_scale
435
+ if smooth_k
436
+ else lse / 1.44269504,
437
+ )
438
+ else:
439
+ return o
440
+
441
+
442
+ @torch.compiler.disable
443
+ def sageattn_qk_int8_pv_fp8_cuda(
444
+ q: torch.Tensor,
445
+ k: torch.Tensor,
446
+ v: torch.Tensor,
447
+ tensor_layout: str = "HND",
448
+ is_causal: bool = False,
449
+ qk_quant_gran: str = "per_thread",
450
+ sm_scale: Optional[float] = None,
451
+ pv_accum_dtype: str = "fp32+fp16",
452
+ smooth_k: bool = True,
453
+ smooth_v: bool = False,
454
+ return_lse: bool = False,
455
+ **kwargs: Any,
456
+ ) -> torch.Tensor:
457
+ """
458
+ SageAttention with INT8 quantization for Q and K, FP8 PV with FP32 accumulation, implemented using CUDA.
459
+
460
+ Parameters
461
+ ----------
462
+ q : torch.Tensor
463
+ The query tensor. Shape:
464
+ - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``.
465
+ - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``.
466
+
467
+ k : torch.Tensor
468
+ The key tensor. Shape:
469
+ - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
470
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
471
+
472
+ v : torch.Tensor
473
+ The value tensor. Shape:
474
+ - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
475
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
476
+
477
+ tensor_layout : str
478
+ The tensor layout, either "HND" or "NHD".
479
+ Default: "HND".
480
+
481
+ is_causal : bool
482
+ Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len.
483
+ Default: False.
484
+
485
+ qk_quant_gran : str
486
+ The granularity of quantization for Q and K, either "per_warp" or "per_thread".
487
+ Default: "per_thread".
488
+
489
+ sm_scale : Optional[float]
490
+ The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``.
491
+
492
+ pv_accum_dtype : str
493
+ The dtype of the accumulation of the product of the value tensor and the attention weights, one of "fp32", "fp32+fp32", or "fp32+fp16".
494
+ - "fp32": PV accumulation is done in fully in FP32. However, due to the hardware issue, there are only 22 valid bits in the FP32 accumulator.
495
+ - "fp32+fp32": PV accumulation is done in FP32 (actually FP22), but added to a FP32 buffer every few iterations. This offers a balance between speed and accuracy.
496
+ Default: "fp32+fp32".
497
+
498
+ smooth_k : bool
499
+ Whether to smooth the key tensor by subtracting the mean along the sequence dimension.
500
+ Default: True.
501
+
502
+ smooth_v : bool
503
+ Whether to smooth the value tensor by subtracting the mean along the sequence dimension.
504
+ smooth_v will be ignored if pv_accum_dtype is "fp32+fp32".
505
+ Default: False.
506
+
507
+ return_lse : bool
508
+ Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention.
509
+ Default: False.
510
+
511
+ Returns
512
+ -------
513
+ torch.Tensor
514
+ The output tensor. Shape:
515
+ - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``.
516
+ - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``.
517
+
518
+ torch.Tensor
519
+ The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor).
520
+ Shape: ``[batch_size, num_qo_heads, qo_len]``.
521
+ Only returned if `return_lse` is True.
522
+
523
+ Note
524
+ ----
525
+ - ``num_qo_heads`` must be divisible by ``num_kv_heads``.
526
+ - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16``
527
+ - All tensors must be on the same cuda device.
528
+ - `smooth_k` will introduce slight overhead but will improve the accuracy under most circumstances.
529
+ """
530
+
531
+ dtype = q.dtype
532
+ assert q.is_cuda, "Input tensors must be on cuda."
533
+ assert dtype in [torch.float16, torch.bfloat16], (
534
+ "Input tensors must be in dtype of torch.float16 or torch.bfloat16"
535
+ )
536
+ assert qk_quant_gran in ["per_warp", "per_thread"], (
537
+ "qk_quant_gran must be either 'per_warp' or 'per_thread'."
538
+ )
539
+ assert q.device == k.device == v.device, "All tensors must be on the same device."
540
+ assert q.dtype == k.dtype == v.dtype, "All tensors must have the same dtype."
541
+
542
+ # cuda_major_version, cuda_minor_version = get_cuda_version()
543
+ # if(cuda_major_version, cuda_minor_version) < (12, 8) and pv_accum_dtype == 'fp32+fp16':
544
+ # warnings.warn("cuda version < 12.8, change pv_accum_dtype to 'fp32+fp32'")
545
+ # pv_accum_dtype = 'fp32+fp32'
546
+
547
+ # FIXME(DefTruth): make sage attention work compatible with distributed
548
+ # env, for example xDiT, which launches via torchrun. Without this workaround,
549
+ # sage attention will run into illegal memory access error after first
550
+ # inference step in distributed env for multi gpus inference. This small
551
+ # workaround also makes sage attention work with torch.compile
552
+ # through non-fullgraph compile mode.
553
+ torch.cuda.set_device(v.device)
554
+
555
+ _tensor_layout = 0 if tensor_layout == "NHD" else 1
556
+ _is_caual = 1 if is_causal else 0
557
+ _qk_quant_gran = 3 if qk_quant_gran == "per_thread" else 2
558
+ _return_lse = 1 if return_lse else 0
559
+
560
+ head_dim_og = q.size(-1)
561
+
562
+ if head_dim_og < 64:
563
+ q = torch.nn.functional.pad(q, (0, 64 - head_dim_og))
564
+ k = torch.nn.functional.pad(k, (0, 64 - head_dim_og))
565
+ v = torch.nn.functional.pad(v, (0, 64 - head_dim_og))
566
+ elif head_dim_og > 64 and head_dim_og < 128:
567
+ q = torch.nn.functional.pad(q, (0, 128 - head_dim_og))
568
+ k = torch.nn.functional.pad(k, (0, 128 - head_dim_og))
569
+ v = torch.nn.functional.pad(v, (0, 128 - head_dim_og))
570
+ elif head_dim_og > 128:
571
+ raise ValueError(f"Unsupported head_dim: {head_dim_og}")
572
+
573
+ # assert last dim is contiguous
574
+ assert q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1, (
575
+ "Last dim of qkv must be contiguous."
576
+ )
577
+
578
+ if sm_scale is None:
579
+ sm_scale = head_dim_og**-0.5
580
+
581
+ seq_dim = 1 if _tensor_layout == 0 else 2
582
+ nh_dim = 2 if _tensor_layout == 0 else 1
583
+
584
+ if smooth_k:
585
+ km = k.mean(dim=seq_dim, keepdim=True)
586
+ nqheads = q.size(2)
587
+ nkheads = k.size(2)
588
+ q_per_kv_heads = nqheads // nkheads
589
+ if q_per_kv_heads > 1:
590
+ # nheads_k => nheads_q
591
+ km_broadcast = torch.repeat_interleave(km, q_per_kv_heads, dim=nh_dim)
592
+ else:
593
+ km_broadcast = km
594
+ if return_lse:
595
+ if tensor_layout == "NHD":
596
+ lse_correction = (
597
+ torch.matmul(
598
+ q.transpose(1, 2), km_broadcast.transpose(1, 2).transpose(2, 3)
599
+ )
600
+ .squeeze(-1)
601
+ .to(torch.float32)
602
+ )
603
+ else:
604
+ lse_correction = (
605
+ torch.matmul(q, km_broadcast.transpose(2, 3))
606
+ .squeeze(-1)
607
+ .to(torch.float32)
608
+ )
609
+ else:
610
+ km = None
611
+
612
+ if qk_quant_gran == "per_warp":
613
+ q_int8, q_scale, k_int8, k_scale = per_warp_int8_cuda(
614
+ q, k, km, tensor_layout=tensor_layout, BLKQ=128, WARPQ=32, BLKK=64
615
+ )
616
+ elif qk_quant_gran == "per_thread":
617
+ q_int8, q_scale, k_int8, k_scale = per_thread_int8_triton(
618
+ q, k, km, tensor_layout=tensor_layout, BLKQ=128, WARPQ=32, BLKK=64, WARPK=64
619
+ )
620
+
621
+ o = torch.empty(q.size(), dtype=dtype, device=q.device)
622
+
623
+ if pv_accum_dtype == "fp32+fp32" and smooth_v:
624
+ warnings.warn("pv_accum_dtype is 'fp32+fp32', smooth_v will be ignored.")
625
+ smooth_v = False
626
+
627
+ if pv_accum_dtype == "fp32+fp16" and smooth_v:
628
+ warnings.warn("pv_accum_dtype is 'fp32+fp16', smooth_v will be ignored.")
629
+ smooth_v = False
630
+
631
+ quant_v_scale_max = 448.0
632
+ if pv_accum_dtype == "fp32+fp16":
633
+ quant_v_scale_max = 2.25
634
+
635
+ v_fp8, v_scale, vm = per_channel_fp8(
636
+ v, tensor_layout=tensor_layout, scale_max=quant_v_scale_max, smooth_v=smooth_v
637
+ )
638
+ print("before kernel call")
639
+ if pv_accum_dtype == "fp32":
640
+ if smooth_v:
641
+ lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_fuse_v_mean_attn(
642
+ q_int8,
643
+ k_int8,
644
+ v_fp8,
645
+ o,
646
+ q_scale,
647
+ k_scale,
648
+ v_scale,
649
+ vm,
650
+ _tensor_layout,
651
+ _is_caual,
652
+ _qk_quant_gran,
653
+ sm_scale,
654
+ _return_lse,
655
+ )
656
+ torch.cuda.synchronize()
657
+ else:
658
+ lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn(
659
+ q_int8,
660
+ k_int8,
661
+ v_fp8,
662
+ o,
663
+ q_scale,
664
+ k_scale,
665
+ v_scale,
666
+ _tensor_layout,
667
+ _is_caual,
668
+ _qk_quant_gran,
669
+ sm_scale,
670
+ _return_lse,
671
+ )
672
+ torch.cuda.synchronize()
673
+ elif pv_accum_dtype == "fp32+fp32":
674
+ lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf(
675
+ q_int8,
676
+ k_int8,
677
+ v_fp8,
678
+ o,
679
+ q_scale,
680
+ k_scale,
681
+ v_scale,
682
+ _tensor_layout,
683
+ _is_caual,
684
+ _qk_quant_gran,
685
+ sm_scale,
686
+ _return_lse,
687
+ )
688
+ torch.cuda.synchronize()
689
+ elif pv_accum_dtype == "fp32+fp16":
690
+ lse = ops.qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf(
691
+ q_int8,
692
+ k_int8,
693
+ v_fp8,
694
+ o,
695
+ q_scale,
696
+ k_scale,
697
+ v_scale,
698
+ _tensor_layout,
699
+ _is_caual,
700
+ _qk_quant_gran,
701
+ sm_scale,
702
+ _return_lse,
703
+ )
704
+ torch.cuda.synchronize()
705
+ o = o[..., :head_dim_og]
706
+ print("after kernel call")
707
+ if return_lse:
708
+ return (
709
+ o,
710
+ lse / 1.44269504 + lse_correction * sm_scale
711
+ if smooth_k
712
+ else lse / 1.44269504,
713
+ )
714
+ else:
715
+ return o
716
+
717
+
718
+ @torch.compiler.disable
719
+ def sageattn_qk_int8_pv_fp8_cuda_sm90(
720
+ q: torch.Tensor,
721
+ k: torch.Tensor,
722
+ v: torch.Tensor,
723
+ tensor_layout: str = "HND",
724
+ is_causal: bool = False,
725
+ qk_quant_gran: str = "per_thread",
726
+ sm_scale: Optional[float] = None,
727
+ pv_accum_dtype: str = "fp32+fp32",
728
+ smooth_k: bool = True,
729
+ return_lse: bool = False,
730
+ **kwargs: Any,
731
+ ) -> torch.Tensor:
732
+ """
733
+ SageAttention with INT8 quantization for Q and K, FP8 PV with FP32 accumulation, implemented using CUDA.
734
+
735
+ Parameters
736
+ ----------
737
+ q : torch.Tensor
738
+ The query tensor. Shape:
739
+ - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``.
740
+ - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``.
741
+
742
+ k : torch.Tensor
743
+ The key tensor. Shape:
744
+ - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
745
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
746
+
747
+ v : torch.Tensor
748
+ The value tensor. Shape:
749
+ - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
750
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
751
+
752
+ tensor_layout : str
753
+ The tensor layout, either "HND" or "NHD".
754
+ Default: "HND".
755
+
756
+ is_causal : bool
757
+ Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len.
758
+ Default: False.
759
+
760
+ qk_quant_gran : str
761
+ The granularity of quantization for Q and K, either "per_warp" or "per_thread".
762
+ Default: "per_thread".
763
+
764
+ sm_scale : Optional[float]
765
+ The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``.
766
+
767
+ pv_accum_dtype : str
768
+ The dtype of the accumulation of the product of the value tensor and the attention weights, either "fp32" or "fp32+fp32". Only "fp32+fp32" is currently implemented on sm90.
769
+ - "fp32": PV accumulation is done in fully in FP32. However, due to the hardware issue, there are only 22 valid bits in the FP32 accumulator.
770
+ - "fp32+fp32": PV accumulation is done in FP32 (actually FP22), but added to a FP32 buffer every few iterations. This offers a balance between speed and accuracy.
771
+ Default: "fp32+fp32".
772
+
773
+ smooth_k : bool
774
+ Whether to smooth the key tensor by subtracting the mean along the sequence dimension.
775
+ Default: True.
776
+
777
+ return_lse : bool
778
+ Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention.
779
+ Default: False.
780
+
781
+ Returns
782
+ -------
783
+ torch.Tensor
784
+ The output tensor. Shape:
785
+ - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``.
786
+ - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``.
787
+
788
+ torch.Tensor
789
+ The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor).
790
+ Shape: ``[batch_size, num_qo_heads, qo_len]``.
791
+ Only returned if `return_lse` is True.
792
+
793
+ Note
794
+ ----
795
+ - ``num_qo_heads`` must be divisible by ``num_kv_heads``.
796
+ - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16``
797
+ - All tensors must be on the same cuda device.
798
+ - `smooth_k` will introduce slight overhead but will improve the accuracy under most circumstances.
799
+ """
800
+
801
+ dtype = q.dtype
802
+ assert q.is_cuda, "Input tensors must be on cuda."
803
+ assert dtype in [torch.float16, torch.bfloat16], (
804
+ "Input tensors must be in dtype of torch.float16 or torch.bfloat16"
805
+ )
806
+ assert qk_quant_gran in ["per_warp", "per_thread"], (
807
+ "qk_quant_gran must be either 'per_warp' or 'per_thread'."
808
+ )
809
+ assert q.device == k.device == v.device, "All tensors must be on the same device."
810
+ assert q.dtype == k.dtype == v.dtype, "All tensors must have the same dtype."
811
+
812
+ torch.cuda.set_device(v.device)
813
+
814
+ _tensor_layout = 0 if tensor_layout == "NHD" else 1
815
+ _is_caual = 1 if is_causal else 0
816
+ _qk_quant_gran = 3 if qk_quant_gran == "per_thread" else 2
817
+ _return_lse = 1 if return_lse else 0
818
+
819
+ head_dim_og = q.size(-1)
820
+
821
+ if head_dim_og < 64:
822
+ q = torch.nn.functional.pad(q, (0, 64 - head_dim_og))
823
+ k = torch.nn.functional.pad(k, (0, 64 - head_dim_og))
824
+ v = torch.nn.functional.pad(v, (0, 64 - head_dim_og))
825
+ elif head_dim_og > 64 and head_dim_og < 128:
826
+ q = torch.nn.functional.pad(q, (0, 128 - head_dim_og))
827
+ k = torch.nn.functional.pad(k, (0, 128 - head_dim_og))
828
+ v = torch.nn.functional.pad(v, (0, 128 - head_dim_og))
829
+ elif head_dim_og > 128:
830
+ raise ValueError(f"Unsupported head_dim: {head_dim_og}")
831
+
832
+ # assert last dim is contiguous
833
+ assert q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1, (
834
+ "Last dim of qkv must be contiguous."
835
+ )
836
+
837
+ if sm_scale is None:
838
+ sm_scale = head_dim_og**-0.5
839
+
840
+ seq_dim = 1 if _tensor_layout == 0 else 2
841
+ nh_dim = 2 if _tensor_layout == 0 else 1
842
+
843
+ if smooth_k:
844
+ km = k.mean(dim=seq_dim, keepdim=True)
845
+ nqheads = q.size(2)
846
+ nkheads = k.size(2)
847
+ q_per_kv_heads = nqheads // nkheads
848
+ if q_per_kv_heads > 1:
849
+ # nheads_k => nheads_q
850
+ km_broadcast = torch.repeat_interleave(km, q_per_kv_heads, dim=nh_dim)
851
+ else:
852
+ km_broadcast = km
853
+ if return_lse:
854
+ if tensor_layout == "NHD":
855
+ lse_correction = (
856
+ torch.matmul(
857
+ q.transpose(1, 2), km_broadcast.transpose(1, 2).transpose(2, 3)
858
+ )
859
+ .squeeze(-1)
860
+ .to(torch.float32)
861
+ )
862
+ else:
863
+ lse_correction = (
864
+ torch.matmul(q, km_broadcast.transpose(2, 3))
865
+ .squeeze(-1)
866
+ .to(torch.float32)
867
+ )
868
+ else:
869
+ km = None
870
+
871
+ if qk_quant_gran == "per_warp":
872
+ q_int8, q_scale, k_int8, k_scale = per_warp_int8_cuda(
873
+ q, k, km, tensor_layout=tensor_layout, BLKQ=64, WARPQ=16, BLKK=128
874
+ )
875
+ elif qk_quant_gran == "per_thread":
876
+ q_int8, q_scale, k_int8, k_scale = per_thread_int8_triton(
877
+ q,
878
+ k,
879
+ km,
880
+ tensor_layout=tensor_layout,
881
+ BLKQ=64,
882
+ WARPQ=16,
883
+ BLKK=128,
884
+ WARPK=128,
885
+ )
886
+
887
+ o = torch.empty(q.size(), dtype=dtype, device=q.device)
888
+
889
+ # pad v to multiple of 128
890
+ # TODO: modify per_channel_fp8 kernel to handle this
891
+ kv_len = k.size(seq_dim)
892
+ v_pad_len = 128 - (kv_len % 128) if kv_len % 128 != 0 else 0
893
+ if v_pad_len > 0:
894
+ if tensor_layout == "HND":
895
+ v = torch.cat(
896
+ [
897
+ v,
898
+ torch.zeros(
899
+ v.size(0),
900
+ v.size(1),
901
+ v_pad_len,
902
+ v.size(3),
903
+ dtype=v.dtype,
904
+ device=v.device,
905
+ ),
906
+ ],
907
+ dim=2,
908
+ )
909
+ else:
910
+ v = torch.cat(
911
+ [
912
+ v,
913
+ torch.zeros(
914
+ v.size(0),
915
+ v_pad_len,
916
+ v.size(2),
917
+ v.size(3),
918
+ dtype=v.dtype,
919
+ device=v.device,
920
+ ),
921
+ ],
922
+ dim=1,
923
+ )
924
+
925
+ v_fp8, v_scale, _ = per_channel_fp8(v, tensor_layout=tensor_layout, smooth_v=False)
926
+
927
+ if pv_accum_dtype == "fp32":
928
+ raise NotImplementedError("Please use pv_accum_dtype='fp32+fp32' for sm90.")
929
+ lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn(
930
+ q_int8,
931
+ k_int8,
932
+ v_fp8,
933
+ o,
934
+ q_scale,
935
+ k_scale,
936
+ v_scale,
937
+ _tensor_layout,
938
+ _is_caual,
939
+ _qk_quant_gran,
940
+ sm_scale,
941
+ _return_lse,
942
+ )
943
+ elif pv_accum_dtype == "fp32+fp32":
944
+ print(
945
+ "qint8",
946
+ q_int8.shape,
947
+ "qscale",
948
+ q_scale.shape,
949
+ "kint8",
950
+ k_int8.shape,
951
+ "kscale",
952
+ k_scale.shape,
953
+ "vfp8",
954
+ v_fp8.shape,
955
+ "vscale",
956
+ v_scale.shape,
957
+ )
958
+ lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf_sm90(
959
+ q_int8,
960
+ k_int8,
961
+ v_fp8,
962
+ o,
963
+ q_scale,
964
+ k_scale,
965
+ v_scale,
966
+ _tensor_layout,
967
+ _is_caual,
968
+ _qk_quant_gran,
969
+ sm_scale,
970
+ _return_lse,
971
+ )
972
+
973
+ o = o[..., :head_dim_og]
974
+
975
+ if return_lse:
976
+ return (
977
+ o,
978
+ lse / 1.44269504 + lse_correction * sm_scale
979
+ if smooth_k
980
+ else lse / 1.44269504,
981
+ )
982
+ else:
983
+ return o
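End of `core.py`. As a reference point, a minimal call into the dispatcher defined above (assumed `sage_attention` import path; requires a GPU whose SM version `sageattn` handles):

import torch
from sage_attention import sageattn

b, h, seq, d = 2, 16, 2048, 128
q = torch.randn(b, h, seq, d, dtype=torch.float16, device="cuda")
k = torch.randn(b, h, seq, d, dtype=torch.float16, device="cuda")
v = torch.randn(b, h, seq, d, dtype=torch.float16, device="cuda")

# Drop-in attention call with causal masking; layout matches the "HND" default.
out = sageattn(q, k, v, tensor_layout="HND", is_causal=True)

# With return_lse=True the per-row logsumexp is also returned (e.g. for Ring Attention).
out, lse = sageattn(q, k, v, tensor_layout="HND", is_causal=True, return_lse=True)
print(out.shape, lse.shape)  # torch.Size([2, 16, 2048, 128]) torch.Size([2, 16, 2048])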
build/torch28-cxx11-cu126-x86_64-linux/sage_attention/layers.py ADDED
File without changes
build/torch28-cxx11-cu126-x86_64-linux/sage_attention/quant.py ADDED
@@ -0,0 +1,326 @@
1
+ """
2
+ Copyright (c) 2024 by SageAttention team.
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ import torch
18
+ from typing import Optional
19
+
20
+ from ._ops import ops
21
+
22
+
23
+ def per_block_int8(
24
+ q: torch.Tensor,
25
+ k: torch.Tensor,
26
+ km: Optional[torch.Tensor] = None,
27
+ BLKQ: int = 128,
28
+ BLKK: int = 64,
29
+ sm_scale: Optional[float] = None,
30
+ tensor_layout: str = "HND",
31
+ ):
32
+ """
33
+ Quantize the query tensor `q` and the key tensor `k` with per block quantization.
34
+
35
+ Parameters
36
+ ----------
37
+ q : torch.Tensor
38
+ The query tensor. Shape:
39
+ - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``.
40
+ - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``.
41
+
42
+ k : torch.Tensor
43
+ The key tensor. Shape:
44
+ - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
45
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
46
+
47
+ km : Optional[torch.Tensor]
48
+ The mean tensor of `k` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]``.
49
+ Should be of the same dtype as `k` if provided. Default is None.
50
+
51
+ sm_scale : Optional[float]
52
+ The scale factor for the softmax operation. Default is ``head_dim**-0.5``.
53
+ It will be multiplied by ``1.44269504`` to work together with the triton attention kernel.
54
+
55
+ tensor_layout : str
56
+ The tensor layout, either "HND" or "NHD".
57
+ Default: "HND".
58
+
59
+ Returns
60
+ -------
61
+ Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]
62
+ A tuple containing:
63
+ - The quantized query tensor. Shape: Same as `q` but with `int8` dtype.
64
+ - The scale tensor of the query tensor. Shape: ``[batch_size, num_qo_heads, (qo_len + BLKQ - 1) // BLKQ]`` with `float32` dtype.
65
+ - The quantized key tensor. Shape: Same as `k` but with `int8` dtype.
66
+ - The scale tensor of the key tensor. Shape: ``[batch_size, num_kv_heads, (kv_len + BLKK - 1) // BLKK]`` with `float32` dtype.
67
+
68
+ Note
69
+ ----
70
+ - The tensors `q` and `k` must have the dtype ``torch.float16`` or ``torch.bfloat16``
71
+ """
72
+
73
+ q_int8 = torch.empty(q.shape, dtype=torch.int8, device=q.device)
74
+ k_int8 = torch.empty(k.shape, dtype=torch.int8, device=k.device)
75
+
76
+ if tensor_layout == "HND":
77
+ b, h_qo, qo_len, head_dim = q.shape
78
+ _, h_kv, kv_len, _ = k.shape
79
+
80
+ elif tensor_layout == "NHD":
81
+ b, qo_len, h_qo, head_dim = q.shape
82
+ _, kv_len, h_kv, _ = k.shape
83
+
84
+ else:
85
+ raise ValueError(f"Unknown tensor layout: {tensor_layout}")
86
+
87
+ _tensor_layout = 0 if tensor_layout == "NHD" else 1
88
+
89
+ q_scale = torch.empty(
90
+ (b, h_qo, (qo_len + BLKQ - 1) // BLKQ), device=q.device, dtype=torch.float32
91
+ )
92
+ k_scale = torch.empty(
93
+ (b, h_kv, (kv_len + BLKK - 1) // BLKK), device=q.device, dtype=torch.float32
94
+ )
95
+
96
+ if sm_scale is None:
97
+ sm_scale = head_dim**-0.5
98
+
99
+ sm_scale *= 1.44269504
100
+
101
+ ops.quant_per_block_int8_cuda(q, q_int8, q_scale, sm_scale, BLKQ, _tensor_layout)
102
+ if km is not None:
103
+ km = km.squeeze(1) if _tensor_layout == 0 else km.squeeze(2)
104
+ ops.quant_per_block_int8_fuse_sub_mean_cuda(
105
+ k, km, k_int8, k_scale, BLKK, _tensor_layout
106
+ )
107
+ else:
108
+ # The bound CUDA op expects an sm_scale argument; use 1.0 for K to avoid scaling
109
+ ops.quant_per_block_int8_cuda(k, k_int8, k_scale, 1.0, BLKK, _tensor_layout)
110
+
111
+ return q_int8, q_scale, k_int8, k_scale
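The docstring above notes that `sm_scale` is multiplied by 1.44269504 before being folded into the quantization: that constant is log2(e), so a kernel that exponentiates with base 2 reproduces exp(sm_scale * x). A quick numeric check, illustrative only and not part of the file:

import math

x, sm_scale = 0.73, 128 ** -0.5
lhs = math.exp(sm_scale * x)            # what softmax needs
rhs = 2 ** (sm_scale * 1.44269504 * x)  # what a base-2 kernel computes
assert abs(lhs - rhs) < 1e-9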
112
+
113
+
114
+ def per_warp_int8(
115
+ q: torch.Tensor,
116
+ k: torch.Tensor,
117
+ km: Optional[torch.Tensor] = None,
118
+ BLKQ: int = 128,
119
+ WARPQ: int = 32,
120
+ BLKK: int = 64,
121
+ tensor_layout: str = "HND",
122
+ ):
123
+ """
124
+ Quantize the query tensor `q` with per warp quantization and the key tensor `k` with per block quantization.
125
+ Warp size of quantizing `q` is 16 or 32, with a block size of 64 or 128.
126
+ Block size of quantizing `k` is 64 or 128.
127
+
128
+ Parameters
129
+ ----------
130
+ q : torch.Tensor
131
+ The query tensor. Shape:
132
+ - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``.
133
+ - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``.
134
+
135
+ k : torch.Tensor
136
+ The key tensor. Shape:
137
+ - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
138
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
139
+
140
+ km : Optional[torch.Tensor]
141
+ The mean tensor of `k` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]``.
142
+ Should be of the same dtype as `k` if provided. Default is None.
143
+
144
+ tensor_layout : str
145
+ The tensor layout, either "HND" or "NHD".
146
+ Default: "HND".
147
+
148
+ Returns
149
+ -------
150
+ Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]
151
+ A tuple containing:
152
+ - The quantized query tensor. Shape: Same as `q` but with `int8` dtype.
153
+ - The scale tensor of the query tensor. Shape: ``[batch_size, num_qo_heads, (qo_len + BLKQ - 1) // BLKQ * (BLKQ // WARPQ)]`` with `float32` dtype.
154
+ - The quantized key tensor. Shape: Same as `k` but with `int8` dtype.
155
+ - The scale tensor of the key tensor. Shape: ``[batch_size, num_kv_heads, (kv_len + BLKK - 1) // BLKK]`` with `float32` dtype.
156
+
157
+ Note
158
+ ----
159
+ - The tensors `q` and `k` must have the dtype ``torch.float16`` or ``torch.bfloat16``
160
+ """
161
+
162
+ q_int8 = torch.empty(q.shape, dtype=torch.int8, device=q.device)
163
+ k_int8 = torch.empty(k.shape, dtype=torch.int8, device=k.device)
164
+
165
+ if tensor_layout == "HND":
166
+ b, h_qo, qo_len, head_dim = q.shape
167
+ _, h_kv, kv_len, _ = k.shape
168
+
169
+ elif tensor_layout == "NHD":
170
+ b, qo_len, h_qo, head_dim = q.shape
171
+ _, kv_len, h_kv, _ = k.shape
172
+
173
+ else:
174
+ raise ValueError(f"Unknown tensor layout: {tensor_layout}")
175
+
176
+ _tensor_layout = 0 if tensor_layout == "NHD" else 1
177
+
178
+ q_scale = torch.empty(
179
+ (b, h_qo, ((qo_len + BLKQ - 1) // BLKQ) * (BLKQ // WARPQ)),
180
+ device=q.device,
181
+ dtype=torch.float32,
182
+ )
183
+ k_scale = torch.empty(
184
+ (b, h_kv, (kv_len + BLKK - 1) // BLKK), device=q.device, dtype=torch.float32
185
+ )
186
+
187
+ ops.quant_per_warp_int8_cuda(q, q_int8, q_scale, BLKQ, WARPQ, _tensor_layout)
188
+
189
+ if km is not None:
190
+ km = km.squeeze(1) if _tensor_layout == 0 else km.squeeze(2)
191
+ ops.quant_per_block_int8_fuse_sub_mean_cuda(
192
+ k, km, k_int8, k_scale, BLKK, _tensor_layout
193
+ )
194
+ else:
195
+ # The bound CUDA op expects an sm_scale argument; use 1.0 for K to avoid scaling
196
+ ops.quant_per_block_int8_cuda(k, k_int8, k_scale, 1.0, BLKK, _tensor_layout)
197
+
198
+ return q_int8, q_scale, k_int8, k_scale
199
+
200
+
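A small sketch of the scale shapes produced by ``per_warp_int8``, under the same assumptions as the previous sketch (CUDA device, fp16 inputs, ``sage_attention`` import path).

```python
import torch
from sage_attention.quant import per_warp_int8  # assumed import path for this build

q = torch.randn(2, 4, 512, 64, dtype=torch.float16, device="cuda")  # HND, qo_len = 512
k = torch.randn(2, 4, 512, 64, dtype=torch.float16, device="cuda")

q_int8, q_scale, k_int8, k_scale = per_warp_int8(q, k, BLKQ=128, WARPQ=32, BLKK=64)

# (512 // 128) blocks * (128 // 32) warp slices per block = 4 * 4 = 16 scales per head
assert q_scale.shape == (2, 4, 16)
assert k_scale.shape == (2, 4, (512 + 63) // 64)  # (2, 4, 8)
```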
201
+ def sub_mean(v: torch.Tensor, tensor_layout: str = "HND"):
202
+ """
203
+ Calculate the mean of the tensor `v` along the sequence length dimension and subtract it from `v`. Result is stored as fp16.
204
+
205
+ Parameters
206
+ ----------
207
+ v : torch.Tensor
208
+ The input tensor. Shape:
209
+ - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
210
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
211
+
212
+ tensor_layout : str
213
+ The tensor layout, either "HND" or "NHD".
214
+ Default: "HND".
215
+
216
+ Returns
217
+ -------
218
+ Tuple[torch.Tensor, torch.Tensor]
219
+ A tuple containing:
220
+ - The tensor `v_smoothed` with the mean subtracted and stored as fp16. Shape: Same as `v` with `float16` dtype.
221
+ - The mean tensor of `v` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]`` with dtype same as `v`.
222
+
223
+ Note
224
+ ----
225
+ - The tensor `v` must have the dtype ``torch.float16`` or ``torch.bfloat16``
226
+ - The returned tensor `v_smoothed` will have dtype ``torch.float16`` regardless of the input dtype.
227
+ - The returned mean tensor will have the same dtype as the input tensor.
228
+ """
229
+
230
+ _tensor_layout = 0 if tensor_layout == "NHD" else 1
231
+ vm = v.mean(dim=1 if _tensor_layout == 0 else 2)
232
+
233
+ v_smoothed = torch.empty(v.shape, dtype=torch.float16, device=v.device)
234
+
235
+ # subtract mean and store the result as fp16
236
+ ops.sub_mean_cuda(v, vm, v_smoothed, _tensor_layout)
237
+
238
+ return v_smoothed, vm
239
+
240
+
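A sketch of the round trip for ``sub_mean``: adding the per-head mean back should approximately reconstruct ``v`` (fp16 rounding aside). Same import-path and CUDA assumptions as above.

```python
import torch
from sage_attention.quant import sub_mean  # assumed import path for this build

v = torch.randn(1, 8, 256, 64, dtype=torch.float16, device="cuda")  # HND layout

v_smoothed, vm = sub_mean(v, tensor_layout="HND")  # vm: [batch, heads, head_dim]

reconstructed = v_smoothed + vm.unsqueeze(2)  # broadcast the mean over the kv_len dimension
assert v_smoothed.dtype == torch.float16
assert torch.allclose(reconstructed.float(), v.float(), atol=1e-2)
```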
241
+ def per_channel_fp8(
242
+ v: torch.Tensor,
243
+ tensor_layout: str = "HND",
244
+ scale_max: float = 448.0,
245
+ smooth_v: bool = True,
246
+ ):
247
+ """
248
+ Transpose, pad and permute the tensor `v` and quantize it to fp8 with per channel quantization.
249
+ `v` is first transposed along the head dimension and the sequence length dimension, then padded to a multiple of 64.
250
+ After that, the tensor is permuted along the sequence length dimension by ``[0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15]``.
251
+ The quantization is done per channel, with the scale value and smooth factor calculated per channel.
252
+
253
+ Parameters
254
+ ----------
255
+ v : torch.Tensor
256
+ The input tensor. Shape:
257
+ - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
258
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
259
+
260
+ tensor_layout : str
261
+ The tensor layout, either "HND" or "NHD".
262
+ Default: "HND".
263
+
264
+ scale_max : float
265
+ The maximum scale value for the quantization. Default is 448.0 (upper bound of E4M3 data format).
266
+
267
+ smooth_v : bool
268
+ Whether to smooth the quantized tensor. Default is True.
269
+
270
+ Returns
271
+ -------
272
+ Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]
273
+ A tuple containing:
274
+ - The quantized tensor `v_fp8`. Shape:
275
+ - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, head_dim, (kv_len + 63) // 64 * 64]``, with `float8_e4m3fn` dtype.
276
+ - If `tensor_layout` is "NHD": ``[batch_size, head_dim, num_kv_heads, (kv_len + 63) // 64 * 64]``, with `float8_e4m3fn` dtype.
277
+ - The scale tensor of `v`. Shape: ``[batch_size, num_kv_heads, head_dim]`` with `float32` dtype.
278
+ - The mean tensor of `v` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]`` with `float32` dtype.
279
+
280
+ Note
281
+ ----
282
+ - The tensor `v` must have the dtype ``torch.float16`` or ``torch.bfloat16``
283
+ - The returned mean tensor will be None if `smooth_v` is False. Otherwise it will have dtype ``torch.float32``.
284
+ """
285
+
286
+ _tensor_layout = 0 if tensor_layout == "NHD" else 1
287
+
288
+ if tensor_layout == "HND":
289
+ b, h_kv, kv_len, head_dim = v.shape
290
+ padded_len = (kv_len + 63) // 64 * 64
291
+ v_transposed_permutted = torch.empty(
292
+ (b, h_kv, head_dim, padded_len), dtype=v.dtype, device=v.device
293
+ )
294
+
295
+ elif tensor_layout == "NHD":
296
+ b, kv_len, h_kv, head_dim = v.shape
297
+ padded_len = (kv_len + 63) // 64 * 64
298
+ v_transposed_permutted = torch.empty(
299
+ (b, head_dim, h_kv, padded_len), dtype=v.dtype, device=v.device
300
+ )
301
+ else:
+ raise ValueError(f"Unknown tensor layout: {tensor_layout}")
+
302
+ ops.transpose_pad_permute_cuda(v, v_transposed_permutted, _tensor_layout)
303
+
304
+ v_fp8 = torch.empty(
305
+ v_transposed_permutted.shape, dtype=torch.float8_e4m3fn, device=v.device
306
+ )
307
+
308
+ v_scale = torch.empty((b, h_kv, head_dim), dtype=torch.float32, device=v.device)
309
+ vm = torch.empty((b, h_kv, head_dim), dtype=torch.float32, device=v.device)
310
+
311
+ if smooth_v:
312
+ ops.mean_scale_fuse_quant_cuda(
313
+ v_transposed_permutted,
314
+ v_fp8,
315
+ vm,
316
+ v_scale,
317
+ kv_len,
318
+ scale_max,
319
+ _tensor_layout,
320
+ )
321
+ return v_fp8, v_scale, vm
322
+ else:
323
+ ops.scale_fuse_quant_cuda(
324
+ v_transposed_permutted, v_fp8, v_scale, kv_len, scale_max, _tensor_layout
325
+ )
326
+ return v_fp8, v_scale, None
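A sketch of the output shapes of ``per_channel_fp8``: the head and sequence dimensions are swapped and the sequence length is padded to a multiple of 64. Same assumptions as the earlier sketches.

```python
import torch
from sage_attention.quant import per_channel_fp8  # assumed import path for this build

v = torch.randn(1, 8, 1000, 64, dtype=torch.float16, device="cuda")  # HND, kv_len = 1000

v_fp8, v_scale, vm = per_channel_fp8(v, tensor_layout="HND", smooth_v=True)

padded_len = (1000 + 63) // 64 * 64  # 1024
assert v_fp8.shape == (1, 8, 64, padded_len) and v_fp8.dtype == torch.float8_e4m3fn
assert v_scale.shape == (1, 8, 64)
assert vm.shape == (1, 8, 64)  # vm is None instead when smooth_v=False
```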
build/torch28-cxx11-cu126-x86_64-linux/sage_attention/quant_per_thread.py ADDED
@@ -0,0 +1,204 @@
1
+ """
2
+ Copyright (c) 2024 by SageAttention team.
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ import torch
18
+ import triton
19
+ import triton.language as tl
20
+
21
+ @triton.jit
22
+ def quant_query_per_thread_int8_kernel(Input, Output, Scale, L,
23
+ stride_iz, stride_ih, stride_in,
24
+ stride_oz, stride_oh, stride_on,
25
+ stride_sz, stride_sh,
26
+ C: tl.constexpr, BLK: tl.constexpr):
27
+ off_blk = tl.program_id(0) // 8
28
+ off_tld = tl.program_id(0) % 8
29
+ off_h = tl.program_id(1)
30
+ off_b = tl.program_id(2)
31
+
32
+ offs_n = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld
33
+ offs_k = tl.arange(0, C)
34
+
35
+ input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :]
36
+ output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :]
37
+ scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 8 + off_tld
38
+
39
+ x = tl.load(input_ptrs, mask=offs_n[:, None] < L)
40
+ x = x.to(tl.float32)
41
+ scale = tl.max(tl.abs(x)) / 127. + 0.0000001
42
+ x_int8 = x / scale
43
+ x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1)
44
+ x_int8 = x_int8.to(tl.int8)
45
+ tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L)
46
+ tl.store(scale_ptrs, scale)
47
+
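The kernels in this file scale by ``max|x| / 127`` and round half away from zero by adding ``0.5 * sign(x)`` before the truncating int8 cast. Below is a plain-PyTorch reference of the same quantize/dequantize arithmetic, offered only as a sketch of the scheme, not as the Triton code itself.

```python
import torch

def quantize_int8_reference(x: torch.Tensor):
    """Reference for the scale + round-half-away-from-zero scheme used in the kernels above."""
    x = x.float()
    scale = x.abs().max() / 127.0 + 1e-7  # matches `tl.max(tl.abs(x)) / 127. + 0.0000001`
    q = x / scale
    q = q + 0.5 * torch.where(q >= 0, torch.ones_like(q), -torch.ones_like(q))
    return q.to(torch.int8), scale        # .to(torch.int8) truncates toward zero

x = torch.randn(32, 64)
x_int8, scale = quantize_int8_reference(x)
x_hat = x_int8.float() * scale            # dequantize
assert (x - x_hat).abs().max() <= scale / 2 + 1e-6  # error bounded by half a quantization step
```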
48
+ @triton.jit
49
+ def quant_key_per_thread_int8_kernel(Input, Output, Scale, L,
50
+ stride_iz, stride_ih, stride_in,
51
+ stride_oz, stride_oh, stride_on,
52
+ stride_sz, stride_sh,
53
+ C: tl.constexpr, BLK: tl.constexpr):
54
+ off_blk = tl.program_id(0) // 4
55
+ off_tld = tl.program_id(0) % 4
56
+ off_h = tl.program_id(1)
57
+ off_b = tl.program_id(2)
58
+
59
+ # offs_n = off_blk * BLK + tl.cat(tl.arange(0, BLK // 8) * 8, tl.arange(0, BLK // 8) * 8 + 1, True) + off_tld * 2
60
+ # offs_k = tl.arange(0, C)
61
+
62
+ # input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :]
63
+ # output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :]
64
+ # scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 4 + off_tld
65
+
66
+ # x = tl.load(input_ptrs, mask=offs_n[:, None] < L)
67
+ # x = x.to(tl.float32)
68
+ # scale = tl.max(tl.abs(x)) / 127. + 0.0000001
69
+ # x_int8 = x / scale
70
+ # x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1)
71
+ # x_int8 = x_int8.to(tl.int8)
72
+ # tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L)
73
+ # tl.store(scale_ptrs, scale)
74
+
75
+ offs_n0 = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld * 2
76
+ offs_n1 = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld * 2 + 1
77
+ offs_k = tl.arange(0, C)
78
+
79
+ input_ptrs0 = Input + off_b * stride_iz + off_h * stride_ih + offs_n0[:, None] * stride_in + offs_k[None, :]
80
+ input_ptrs1 = Input + off_b * stride_iz + off_h * stride_ih + offs_n1[:, None] * stride_in + offs_k[None, :]
81
+ output_ptrs0 = Output + off_b * stride_oz + off_h * stride_oh + offs_n0[:, None] * stride_on + offs_k[None, :]
82
+ output_ptrs1 = Output + off_b * stride_oz + off_h * stride_oh + offs_n1[:, None] * stride_on + offs_k[None, :]
83
+ scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 4 + off_tld
84
+
85
+ x0 = tl.load(input_ptrs0, mask=offs_n0[:, None] < L)
86
+ x1 = tl.load(input_ptrs1, mask=offs_n1[:, None] < L)
87
+ x0 = x0.to(tl.float32)
88
+ x1 = x1.to(tl.float32)
89
+ scale = max(tl.max(tl.abs(x0)), tl.max(tl.abs(x1))) / 127. + 0.0000001
90
+ x0_int8 = x0 / scale
91
+ x1_int8 = x1 / scale
92
+ x0_int8 += 0.5 * tl.where(x0_int8 >= 0, 1, -1)
93
+ x1_int8 += 0.5 * tl.where(x1_int8 >= 0, 1, -1)
94
+ x0_int8 = x0_int8.to(tl.int8)
95
+ x1_int8 = x1_int8.to(tl.int8)
96
+ tl.store(output_ptrs0, x0_int8, mask=offs_n0[:, None] < L)
97
+ tl.store(output_ptrs1, x1_int8, mask=offs_n1[:, None] < L)
98
+ tl.store(scale_ptrs, scale)
99
+
100
+ @triton.jit
101
+ def quant_query_per_thread_int4_kernel(Input, Output, Scale, L,
102
+ stride_iz, stride_ih, stride_in,
103
+ stride_oz, stride_oh, stride_on,
104
+ stride_sz, stride_sh,
105
+ C: tl.constexpr, BLK: tl.constexpr):
106
+ off_blk = tl.program_id(0) // 8
107
+ off_tld = tl.program_id(0) % 8
108
+ off_h = tl.program_id(1)
109
+ off_b = tl.program_id(2)
110
+
111
+ offs_n = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld
112
+ offs_k = tl.arange(0, C)
113
+
114
+ input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :]
115
+ output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :]
116
+ scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 8 + off_tld
117
+
118
+ x = tl.load(input_ptrs, mask=offs_n[:, None] < L)
119
+ x = x.to(tl.float32)
120
+ scale = tl.max(tl.abs(x)) / 7. + 0.0000001
121
+ x_int8 = x / scale
122
+ x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1)
123
+ x_int8 = x_int8.to(tl.int8)
124
+ tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L)
125
+ tl.store(scale_ptrs, scale)
126
+
127
+ @triton.jit
128
+ def quant_key_per_thread_int4_kernel(Input, Output, Scale, L,
129
+ stride_iz, stride_ih, stride_in,
130
+ stride_oz, stride_oh, stride_on,
131
+ stride_sz, stride_sh,
132
+ C: tl.constexpr, BLK: tl.constexpr):
133
+ off_blk = tl.program_id(0) // 4
134
+ off_tld = tl.program_id(0) % 4
135
+ off_h = tl.program_id(1)
136
+ off_b = tl.program_id(2)
137
+
138
+ offs_n = off_blk * BLK + tl.cat(tl.arange(0, BLK // 8) * 8, tl.arange(0, BLK // 8) * 8 + 1, True) + off_tld * 2
139
+ offs_k = tl.arange(0, C)
140
+
141
+ input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :]
142
+ output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :]
143
+ scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 4 + off_tld
144
+
145
+ x = tl.load(input_ptrs, mask=offs_n[:, None] < L)
146
+ x = x.to(tl.float32)
147
+ scale = tl.max(tl.abs(x)) / 7. + 0.0000001
148
+ x_int8 = x / scale
149
+ x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1)
150
+ x_int8 = x_int8.to(tl.int8)
151
+ tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L)
152
+ tl.store(scale_ptrs, scale)
153
+
154
+ def per_thread_int8(q, k, km=None, BLKQ=128, WARPQ=32, BLKK=64, WARPK=64, sm_scale=None, tensor_layout="HND"):
155
+ q_int8 = torch.empty(q.shape, dtype=torch.int8, device=q.device)
156
+ k_int8 = torch.empty(k.shape, dtype=torch.int8, device=k.device)
157
+
158
+ if km is not None:
159
+ k = k - km
160
+
161
+ if tensor_layout == "HND":
162
+ b, h_qo, qo_len, head_dim = q.shape
163
+ _, h_kv, kv_len, _ = k.shape
164
+
165
+ stride_bz_q, stride_h_q, stride_seq_q = q.stride(0), q.stride(1), q.stride(2)
166
+ stride_bz_qo, stride_h_qo, stride_seq_qo = q_int8.stride(0), q_int8.stride(1), q_int8.stride(2)
167
+ stride_bz_k, stride_h_k, stride_seq_k = k.stride(0), k.stride(1), k.stride(2)
168
+ stride_bz_ko, stride_h_ko, stride_seq_ko = k_int8.stride(0), k_int8.stride(1), k_int8.stride(2)
169
+ elif tensor_layout == "NHD":
170
+ b, qo_len, h_qo, head_dim = q.shape
171
+ _, kv_len, h_kv, _ = k.shape
172
+
173
+ stride_bz_q, stride_h_q, stride_seq_q = q.stride(0), q.stride(2), q.stride(1)
174
+ stride_bz_qo, stride_h_qo, stride_seq_qo = q_int8.stride(0), q_int8.stride(2), q_int8.stride(1)
175
+ stride_bz_k, stride_h_k, stride_seq_k = k.stride(0), k.stride(2), k.stride(1)
176
+ stride_bz_ko, stride_h_ko, stride_seq_ko = k_int8.stride(0), k_int8.stride(2), k_int8.stride(1)
177
+ else:
178
+ raise ValueError(f"Unknown tensor layout: {tensor_layout}")
179
+
180
+ q_scale = torch.empty((b, h_qo, (qo_len + BLKQ - 1) // BLKQ * (BLKQ // WARPQ) * 8), device=q.device, dtype=torch.float32)
181
+ k_scale = torch.empty((b, h_kv, (kv_len + BLKK - 1) // BLKK * (BLKK // WARPK) * 4), device=q.device, dtype=torch.float32)
182
+
183
+ if sm_scale is None:
184
+ sm_scale = head_dim**-0.5
185
+
186
+ grid = ((qo_len + BLKQ - 1) // BLKQ * (BLKQ // WARPQ) * 8, h_qo, b)
187
+ quant_query_per_thread_int8_kernel[grid](
188
+ q, q_int8, q_scale, qo_len,
189
+ stride_bz_q, stride_h_q, stride_seq_q,
190
+ stride_bz_qo, stride_h_qo, stride_seq_qo,
191
+ q_scale.stride(0), q_scale.stride(1),
192
+ C=head_dim, BLK=WARPQ
193
+ )
194
+
195
+ grid = ((kv_len + BLKK - 1) // BLKK * (BLKK // WARPK) * 4, h_kv, b)
196
+ quant_key_per_thread_int8_kernel[grid](
197
+ k, k_int8, k_scale, kv_len,
198
+ stride_bz_k, stride_h_k, stride_seq_k,
199
+ stride_bz_ko, stride_h_ko, stride_seq_ko,
200
+ k_scale.stride(0), k_scale.stride(1),
201
+ C=head_dim, BLK=WARPK
202
+ )
203
+
204
+ return q_int8, q_scale, k_int8, k_scale
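A usage sketch for this Triton path (requires a CUDA device and Triton; the ``sage_attention`` import path is again an assumption). Each WARPQ slice of ``q`` gets 8 scales and each WARPK slice of ``k`` gets 4, one per thread group.

```python
import torch
from sage_attention.quant_per_thread import per_thread_int8  # assumed import path

q = torch.randn(1, 8, 1024, 128, dtype=torch.float16, device="cuda")  # HND layout
k = torch.randn(1, 8, 1024, 128, dtype=torch.float16, device="cuda")

q_int8, q_scale, k_int8, k_scale = per_thread_int8(q, k, BLKQ=128, WARPQ=32, BLKK=64, WARPK=64)

assert q_scale.shape == (1, 8, (1024 // 128) * (128 // 32) * 8)  # (1, 8, 256)
assert k_scale.shape == (1, 8, (1024 // 64) * (64 // 64) * 4)    # (1, 8, 64)
```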
build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__init__.py ADDED
@@ -0,0 +1,12 @@
1
+ from .quant import per_block_int8, per_warp_int8, sub_mean, per_channel_fp8
2
+ from .core import sageattn, sageattn_qk_int8_pv_fp8_cuda
3
+
4
+
5
+ __all__ = [
6
+ "per_block_int8",
7
+ "per_warp_int8",
8
+ "sub_mean",
9
+ "per_channel_fp8",
10
+ "sageattn",
11
+ "sageattn_qk_int8_pv_fp8_cuda",
12
+ ]
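An end-to-end sketch of the public API exported here, assuming a supported GPU (sm80/sm89/sm90/sm120 per the dispatcher in ``core.py``) and that this build directory is importable as ``sage_attention``.

```python
import torch
from sage_attention import sageattn  # assumed: this build directory is on the Python path

# NHD layout: [batch, seq_len, heads, head_dim]
q = torch.randn(1, 2048, 16, 128, dtype=torch.float16, device="cuda")
k = torch.randn(1, 2048, 16, 128, dtype=torch.float16, device="cuda")
v = torch.randn(1, 2048, 16, 128, dtype=torch.float16, device="cuda")

out = sageattn(q, k, v, tensor_layout="NHD", is_causal=True)
print(out.shape)  # torch.Size([1, 2048, 16, 128])
```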
build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (433 Bytes).
 
build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc ADDED
Binary file (550 Bytes).
 
build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc ADDED
Binary file (33.4 kB).
 
build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc ADDED
Binary file (13.4 kB).
 
build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc ADDED
Binary file (13 kB).
 
build/torch28-cxx11-cu128-x86_64-linux/sage_attention/_ops.py ADDED
@@ -0,0 +1,9 @@
1
+ import torch
2
+ from . import _sage_attention_44b112f_dirty
3
+ ops = torch.ops._sage_attention_44b112f_dirty
4
+
5
+ def add_op_namespace_prefix(op_name: str):
6
+ """
7
+ Prefix op by namespace.
8
+ """
9
+ return f"_sage_attention_44b112f_dirty::{op_name}"
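A small sketch of how this shim is used: ``ops`` exposes the compiled kernels, and ``add_op_namespace_prefix`` builds the fully qualified op name for this particular build (import path assumed, as in the earlier sketches).

```python
from sage_attention._ops import add_op_namespace_prefix, ops  # assumed import path

print(add_op_namespace_prefix("quant_per_block_int8_cuda"))
# -> "_sage_attention_44b112f_dirty::quant_per_block_int8_cuda"

# The same kernels are reachable as attributes of `ops`, e.g. ops.quant_per_block_int8_cuda(...)
print(hasattr(ops, "quant_per_block_int8_cuda"))
```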
build/torch28-cxx11-cu128-x86_64-linux/sage_attention/_sage_attention_44b112f_dirty.abi3.so ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:826ab66e6c33b3b2b17c30371934a55e972d560197c5492f4dedf6fcc29f1a1e
3
+ size 26553920
build/torch28-cxx11-cu128-x86_64-linux/sage_attention/core.py ADDED
@@ -0,0 +1,983 @@
1
+ """
2
+ Copyright (c) 2024 by SageAttention team.
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ import torch
18
+ import torch.nn.functional as F
19
+
20
+ from ._ops import ops
21
+
22
+
23
+ from .quant import per_warp_int8 as per_warp_int8_cuda
24
+ from .quant import sub_mean
25
+ from .quant import per_channel_fp8
26
+ from .quant_per_thread import per_thread_int8 as per_thread_int8_triton
27
+
28
+ from typing import Any, List, Literal, Optional, Tuple, Union
29
+ import warnings
30
+
31
+
32
+ import subprocess
33
+ import re
34
+
35
+
36
+ def get_cuda_version():
37
+ try:
38
+ output = subprocess.check_output(["nvcc", "--version"]).decode()
39
+ match = re.search(r"release (\d+)\.(\d+)", output)
40
+ if match:
41
+ major, minor = int(match.group(1)), int(match.group(2))
42
+ return major, minor
43
+ except Exception as e:
44
+ print("Failed to get CUDA version:", e)
45
+ return None, None
46
+
47
+
48
+ def get_cuda_arch_versions():
49
+ cuda_archs = []
50
+ for i in range(torch.cuda.device_count()):
51
+ major, minor = torch.cuda.get_device_capability(i)
52
+ cuda_archs.append(f"sm{major}{minor}")
53
+ return cuda_archs
54
+
55
+
56
+ def sageattn(
57
+ q: torch.Tensor,
58
+ k: torch.Tensor,
59
+ v: torch.Tensor,
60
+ tensor_layout: str = "HND",
61
+ is_causal: bool = False,
62
+ sm_scale: Optional[float] = None,
63
+ return_lse: bool = False,
64
+ **kwargs: Any,
65
+ ):
66
+ """
67
+ Automatically selects the appropriate implementation of the SageAttention kernel based on the GPU compute capability.
68
+
69
+ Parameters
70
+ ----------
71
+ q : torch.Tensor
72
+ The query tensor. Shape:
73
+ - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``.
74
+ - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``.
75
+
76
+ k : torch.Tensor
77
+ The key tensor. Shape:
78
+ - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
79
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
80
+
81
+ v : torch.Tensor
82
+ The value tensor. Shape:
83
+ - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
84
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
85
+
86
+ tensor_layout : str
87
+ The tensor layout, either "HND" or "NHD".
88
+ Default: "HND".
89
+
90
+ is_causal : bool
91
+ Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len.
92
+ Default: False.
93
+
94
+ sm_scale : Optional[float]
95
+ The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``.
96
+
97
+ return_lse : bool
98
+ Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention.
99
+ Default: False.
100
+
101
+ Returns
102
+ -------
103
+ torch.Tensor
104
+ The output tensor. Shape:
105
+ - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``.
106
+ - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``.
107
+
108
+ torch.Tensor
109
+ The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor).
110
+ Shape: ``[batch_size, num_qo_heads, qo_len]``.
111
+ Only returned if `return_lse` is True.
112
+
113
+ Note
114
+ ----
115
+ - ``num_qo_heads`` must be divisible by ``num_kv_heads``.
116
+ - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16``
117
+ - All tensors must be on the same cuda device.
118
+ """
119
+
120
+ arch = get_cuda_arch_versions()[q.device.index]
121
+ if arch == "sm80":
122
+ return sageattn_qk_int8_pv_fp16_cuda(
123
+ q,
124
+ k,
125
+ v,
126
+ tensor_layout=tensor_layout,
127
+ is_causal=is_causal,
128
+ sm_scale=sm_scale,
129
+ return_lse=return_lse,
130
+ pv_accum_dtype="fp32",
131
+ )
132
+ elif arch == "sm89":
133
+ return sageattn_qk_int8_pv_fp8_cuda(
134
+ q,
135
+ k,
136
+ v,
137
+ tensor_layout=tensor_layout,
138
+ is_causal=is_causal,
139
+ sm_scale=sm_scale,
140
+ return_lse=return_lse,
141
+ pv_accum_dtype="fp32+fp16",
142
+ )
143
+ elif arch == "sm90":
144
+ return sageattn_qk_int8_pv_fp8_cuda_sm90(
145
+ q,
146
+ k,
147
+ v,
148
+ tensor_layout=tensor_layout,
149
+ is_causal=is_causal,
150
+ sm_scale=sm_scale,
151
+ return_lse=return_lse,
152
+ pv_accum_dtype="fp32+fp32",
153
+ )
154
+ elif arch == "sm120":
155
+ return sageattn_qk_int8_pv_fp8_cuda(
156
+ q,
157
+ k,
158
+ v,
159
+ tensor_layout=tensor_layout,
160
+ is_causal=is_causal,
161
+ qk_quant_gran="per_warp",
162
+ sm_scale=sm_scale,
163
+ return_lse=return_lse,
164
+ pv_accum_dtype="fp32+fp16",
165
+ ) # sm120 has accurate fp32 accumulator for fp8 mma and triton kernel is currently not usable on sm120.
166
+ else:
167
+ raise ValueError(f"Unsupported CUDA architecture: {arch}")
168
+
169
+
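The note above only requires ``num_qo_heads`` to be divisible by ``num_kv_heads``, so grouped-query attention works directly through the dispatcher. A hedged sketch, assuming a supported GPU and the ``sage_attention`` import path:

```python
import torch
from sage_attention import sageattn  # assumed importable as `sage_attention`

# Grouped-query attention: 32 query heads sharing 8 kv heads (32 % 8 == 0), HND layout
q = torch.randn(1, 32, 4096, 128, dtype=torch.float16, device="cuda")
k = torch.randn(1, 8, 4096, 128, dtype=torch.float16, device="cuda")
v = torch.randn(1, 8, 4096, 128, dtype=torch.float16, device="cuda")

out, lse = sageattn(q, k, v, tensor_layout="HND", is_causal=True, return_lse=True)
print(out.shape, lse.shape)  # [1, 32, 4096, 128], [1, 32, 4096]
```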
170
+ @torch.compiler.disable
171
+ def sageattn_qk_int8_pv_fp16_cuda(
172
+ q: torch.Tensor,
173
+ k: torch.Tensor,
174
+ v: torch.Tensor,
175
+ tensor_layout: str = "HND",
176
+ is_causal: bool = False,
177
+ qk_quant_gran: str = "per_thread",
178
+ sm_scale: Optional[float] = None,
179
+ pv_accum_dtype: str = "fp32",
180
+ smooth_k: bool = True,
181
+ smooth_v: bool = False,
182
+ return_lse: bool = False,
183
+ **kwargs: Any,
184
+ ) -> torch.Tensor:
185
+ """
186
+ SageAttention with INT8 quantization for Q and K, FP16 PV with FP16/FP32 accumulation, implemented using CUDA.
187
+
188
+ Parameters
189
+ ----------
190
+ q : torch.Tensor
191
+ The query tensor. Shape:
192
+ - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``.
193
+ - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``.
194
+
195
+ k : torch.Tensor
196
+ The key tensor. Shape:
197
+ - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
198
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
199
+
200
+ v : torch.Tensor
201
+ The value tensor. Shape:
202
+ - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
203
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
204
+
205
+ tensor_layout : str
206
+ The tensor layout, either "HND" or "NHD".
207
+ Default: "HND".
208
+
209
+ is_causal : bool
210
+ Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len.
211
+ Default: False.
212
+
213
+ qk_quant_gran : str
214
+ The granularity of quantization for Q and K, either "per_warp" or "per_thread".
215
+ Default: "per_thread".
216
+
217
+ sm_scale : Optional[float]
218
+ The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``.
219
+
220
+ pv_accum_dtype : str
221
+ The dtype of the accumulation of the product of the value tensor and the attention weights, either "fp16", "fp16+fp32" or "fp32".
222
+ - "fp16": PV accumulation is done fully in FP16. This is the fastest option but may lead to numerical instability. The `smooth_v` option increases accuracy when the value tensor has a large bias (as in CogVideoX-2b).
223
+ - "fp32": PV accumulation is done in FP32. This is the most accurate option but may be slower than "fp16" due to CUDA core overhead.
224
+ - "fp16+fp32": PV accumulation is done in FP16, but added to a FP32 buffer every few iterations. This offers a balance between speed and accuracy.
225
+ Default: "fp32".
226
+
227
+ smooth_k : bool
228
+ Whether to smooth the key tensor by subtracting the mean along the sequence dimension.
229
+ Default: True.
230
+
231
+ smooth_v : bool
232
+ Whether to smooth the value tensor by subtracting the mean along the sequence dimension.
233
+ smooth_v will be ignored if pv_accum_dtype is "fp32" or "fp16+fp32".
234
+ Default: False.
235
+
236
+ return_lse : bool
237
+ Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention.
238
+ Default: False.
239
+
240
+ Returns
241
+ -------
242
+ torch.Tensor
243
+ The output tensor. Shape:
244
+ - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``.
245
+ - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``.
246
+
247
+ torch.Tensor
248
+ The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor).
249
+ Shape: ``[batch_size, num_qo_heads, qo_len]``.
250
+ Only returned if `return_lse` is True.
251
+
252
+ Note
253
+ ----
254
+ - ``num_qo_heads`` must be divisible by ``num_kv_heads``.
255
+ - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16``
256
+ - All tensors must be on the same cuda device.
257
+ - `smooth_k` will introduce slight overhead but will improve the accuracy under most circumstances.
258
+ """
259
+
260
+ dtype = q.dtype
261
+ assert q.is_cuda, "Input tensors must be on cuda."
262
+ assert dtype in [torch.float16, torch.bfloat16], (
263
+ "Input tensors must be in dtype of torch.float16 or torch.bfloat16"
264
+ )
265
+ assert qk_quant_gran in ["per_warp", "per_thread"], (
266
+ "qk_quant_gran must be either 'per_warp' or 'per_thread'."
267
+ )
268
+ assert q.device == k.device == v.device, "All tensors must be on the same device."
269
+ assert q.dtype == k.dtype == v.dtype, "All tensors must have the same dtype."
270
+
271
+ # FIXME(DefTruth): make sage attention work compatible with distributed
272
+ # env, for example, xDiT which launch by torchrun. Without this workaround,
273
+ # sage attention will run into illegal memory access error after first
274
+ # inference step in distributed env for multi gpus inference. This small
275
+ # workaround also make sage attention work compatible with torch.compile
276
+ # through non-fullgraph compile mode.
277
+ torch.cuda.set_device(v.device)
278
+
279
+ _tensor_layout = 0 if tensor_layout == "NHD" else 1
280
+ _is_caual = 1 if is_causal else 0
281
+ _qk_quant_gran = 3 if qk_quant_gran == "per_thread" else 2
282
+ _return_lse = 1 if return_lse else 0
283
+
284
+ head_dim_og = q.size(-1)
285
+
286
+ if head_dim_og < 64:
287
+ q = torch.nn.functional.pad(q, (0, 64 - head_dim_og))
288
+ k = torch.nn.functional.pad(k, (0, 64 - head_dim_og))
289
+ v = torch.nn.functional.pad(v, (0, 64 - head_dim_og))
290
+ elif head_dim_og > 64 and head_dim_og < 128:
291
+ q = torch.nn.functional.pad(q, (0, 128 - head_dim_og))
292
+ k = torch.nn.functional.pad(k, (0, 128 - head_dim_og))
293
+ v = torch.nn.functional.pad(v, (0, 128 - head_dim_og))
294
+ elif head_dim_og > 128:
295
+ raise ValueError(f"Unsupported head_dim: {head_dim_og}")
296
+
297
+ # assert last dim is contiguous
298
+ assert q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1, (
299
+ "Last dim of qkv must be contiguous."
300
+ )
301
+
302
+ if sm_scale is None:
303
+ sm_scale = head_dim_og**-0.5
304
+
305
+ seq_dim = 1 if _tensor_layout == 0 else 2
306
+ nh_dim = 2 if _tensor_layout == 0 else 1
307
+
308
+ if smooth_k:
309
+ km = k.mean(dim=seq_dim, keepdim=True)
310
+ nqheads = q.size(nh_dim)  # number of heads; the head dimension position depends on tensor_layout
311
+ nkheads = k.size(nh_dim)
312
+ q_per_kv_heads = nqheads // nkheads
313
+ if q_per_kv_heads > 1:
314
+ # nheads_k => nheads_q
315
+ km_broadcast = torch.repeat_interleave(km, q_per_kv_heads, dim=nh_dim)
316
+ else:
317
+ km_broadcast = km
318
+ if return_lse:
319
+ if tensor_layout == "NHD":
320
+ lse_correction = (
321
+ torch.matmul(
322
+ q.transpose(1, 2), km_broadcast.transpose(1, 2).transpose(2, 3)
323
+ )
324
+ .squeeze(-1)
325
+ .to(torch.float32)
326
+ )
327
+ else:
328
+ lse_correction = (
329
+ torch.matmul(q, km_broadcast.transpose(2, 3))
330
+ .squeeze(-1)
331
+ .to(torch.float32)
332
+ )
333
+ else:
334
+ km = None
335
+
336
+ if qk_quant_gran == "per_warp":
337
+ q_int8, q_scale, k_int8, k_scale = per_warp_int8_cuda(
338
+ q,
339
+ k,
340
+ km,
341
+ tensor_layout=tensor_layout,
342
+ BLKQ=128,
343
+ WARPQ=(16 if (q.size(-1) == 128 and pv_accum_dtype == "fp16+fp32") else 32),
344
+ BLKK=64,
345
+ )
346
+ elif qk_quant_gran == "per_thread":
347
+ q_int8, q_scale, k_int8, k_scale = per_thread_int8_triton(
348
+ q,
349
+ k,
350
+ km,
351
+ tensor_layout=tensor_layout,
352
+ BLKQ=128,
353
+ WARPQ=(16 if (q.size(-1) == 128 and pv_accum_dtype == "fp16+fp32") else 32),
354
+ BLKK=64,
355
+ WARPK=64,
356
+ )
357
+
358
+ o = torch.empty(q.size(), dtype=dtype, device=q.device)
359
+
360
+ if pv_accum_dtype in ["fp32", "fp16+fp32"] and smooth_v:
361
+ warnings.warn(f"pv_accum_dtype is {pv_accum_dtype}, smooth_v will be ignored.")
362
+ smooth_v = False
363
+
364
+ if pv_accum_dtype == "fp32":
365
+ v = v.to(torch.float16)
366
+ lse = _qattn_sm80.qk_int8_sv_f16_accum_f32_attn(
367
+ q_int8,
368
+ k_int8,
369
+ v,
370
+ o,
371
+ q_scale,
372
+ k_scale,
373
+ _tensor_layout,
374
+ _is_caual,
375
+ _qk_quant_gran,
376
+ sm_scale,
377
+ _return_lse,
378
+ )
379
+ elif pv_accum_dtype == "fp16":
380
+ if smooth_v:
381
+ smoothed_v, vm = sub_mean(v, tensor_layout=tensor_layout)
382
+ lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_fuse_v_mean_attn(
383
+ q_int8,
384
+ k_int8,
385
+ smoothed_v,
386
+ o,
387
+ q_scale,
388
+ k_scale,
389
+ vm,
390
+ _tensor_layout,
391
+ _is_caual,
392
+ _qk_quant_gran,
393
+ sm_scale,
394
+ _return_lse,
395
+ )
396
+ else:
397
+ v = v.to(torch.float16)
398
+ lse = ops.qk_int8_sv_f16_accum_f16_attn(  # was `_qattn_sm80` (undefined here)
399
+ q_int8,
400
+ k_int8,
401
+ v,
402
+ o,
403
+ q_scale,
404
+ k_scale,
405
+ _tensor_layout,
406
+ _is_caual,
407
+ _qk_quant_gran,
408
+ sm_scale,
409
+ _return_lse,
410
+ )
411
+ elif pv_accum_dtype == "fp16+fp32":
412
+ v = v.to(torch.float16)
413
+ lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_attn_inst_buf(
414
+ q_int8,
415
+ k_int8,
416
+ v,
417
+ o,
418
+ q_scale,
419
+ k_scale,
420
+ _tensor_layout,
421
+ _is_caual,
422
+ _qk_quant_gran,
423
+ sm_scale,
424
+ _return_lse,
425
+ )
426
+ else:
427
+ raise ValueError(f"Unsupported pv_accum_dtype: {pv_accum_dtype}")
428
+
429
+ o = o[..., :head_dim_og]
430
+
431
+ if return_lse:
432
+ return (
433
+ o,
434
+ lse / 1.44269504 + lse_correction * sm_scale
435
+ if smooth_k
436
+ else lse / 1.44269504,
437
+ )
438
+ else:
439
+ return o
440
+
441
+
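The ``lse / 1.44269504 + lse_correction * sm_scale`` returned above converts the kernel's base-2 logsumexp over ``q . (k - km)`` back to a natural-log logsumexp over ``q . k`` (1.44269504 is log2(e)). A small CPU sketch of the underlying identity, using plain tensors rather than the CUDA kernel:

```python
import torch

torch.manual_seed(0)
q = torch.randn(4, 64)            # 4 query rows
k = torch.randn(128, 64)          # 128 key rows
sm_scale = 64 ** -0.5

km = k.mean(dim=0, keepdim=True)  # smooth_k: subtract the per-channel key mean
lse_smoothed = torch.logsumexp((q @ (k - km).T) * sm_scale, dim=-1)
lse_correction = (q @ km.T).squeeze(-1)   # what the wrapper precomputes as `lse_correction`

lse_true = torch.logsumexp((q @ k.T) * sm_scale, dim=-1)
assert torch.allclose(lse_smoothed + lse_correction * sm_scale, lse_true, atol=1e-5)
```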
442
+ @torch.compiler.disable
443
+ def sageattn_qk_int8_pv_fp8_cuda(
444
+ q: torch.Tensor,
445
+ k: torch.Tensor,
446
+ v: torch.Tensor,
447
+ tensor_layout: str = "HND",
448
+ is_causal: bool = False,
449
+ qk_quant_gran: str = "per_thread",
450
+ sm_scale: Optional[float] = None,
451
+ pv_accum_dtype: str = "fp32+fp16",
452
+ smooth_k: bool = True,
453
+ smooth_v: bool = False,
454
+ return_lse: bool = False,
455
+ **kwargs: Any,
456
+ ) -> torch.Tensor:
457
+ """
458
+ SageAttention with INT8 quantization for Q and K, FP8 PV with FP32 accumulation, implemented using CUDA.
459
+
460
+ Parameters
461
+ ----------
462
+ q : torch.Tensor
463
+ The query tensor. Shape:
464
+ - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``.
465
+ - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``.
466
+
467
+ k : torch.Tensor
468
+ The key tensor. Shape:
469
+ - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
470
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
471
+
472
+ v : torch.Tensor
473
+ The value tensor. Shape:
474
+ - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
475
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
476
+
477
+ tensor_layout : str
478
+ The tensor layout, either "HND" or "NHD".
479
+ Default: "HND".
480
+
481
+ is_causal : bool
482
+ Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len.
483
+ Default: False.
484
+
485
+ qk_quant_gran : str
486
+ The granularity of quantization for Q and K, either "per_warp" or "per_thread".
487
+ Default: "per_thread".
488
+
489
+ sm_scale : Optional[float]
490
+ The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``.
491
+
492
+ pv_accum_dtype : str
493
+ The dtype of the accumulation of the product of the value tensor and the attention weights, one of "fp32", "fp32+fp32", or "fp32+fp16".
494
+ - "fp32": PV accumulation is done fully in FP32. However, due to a hardware limitation, only 22 bits of the FP32 accumulator are valid.
495
+ - "fp32+fp32": PV accumulation is done in FP32 (actually FP22), but added to an FP32 buffer every few iterations. This offers a balance between speed and accuracy.
+ - "fp32+fp16": PV accumulation is done in FP16 with a periodic FP32 buffer; in this path the value quantization scale is capped at 2.25 (see ``quant_v_scale_max`` in the body).
496
+ Default: "fp32+fp16".
497
+
498
+ smooth_k : bool
499
+ Whether to smooth the key tensor by subtracting the mean along the sequence dimension.
500
+ Default: True.
501
+
502
+ smooth_v : bool
503
+ Whether to smooth the value tensor by subtracting the mean along the sequence dimension.
504
+ smooth_v will be ignored if pv_accum_dtype is "fp32+fp32".
505
+ Default: False.
506
+
507
+ return_lse : bool
508
+ Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention.
509
+ Default: False.
510
+
511
+ Returns
512
+ -------
513
+ torch.Tensor
514
+ The output tensor. Shape:
515
+ - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``.
516
+ - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``.
517
+
518
+ torch.Tensor
519
+ The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor).
520
+ Shape: ``[batch_size, num_qo_heads, qo_len]``.
521
+ Only returned if `return_lse` is True.
522
+
523
+ Note
524
+ ----
525
+ - ``num_qo_heads`` must be divisible by ``num_kv_heads``.
526
+ - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16``
527
+ - All tensors must be on the same cuda device.
528
+ - `smooth_k` will introduce slight overhead but will improve the accuracy under most circumstances.
529
+ """
530
+
531
+ dtype = q.dtype
532
+ assert q.is_cuda, "Input tensors must be on cuda."
533
+ assert dtype in [torch.float16, torch.bfloat16], (
534
+ "Input tensors must be in dtype of torch.float16 or torch.bfloat16"
535
+ )
536
+ assert qk_quant_gran in ["per_warp", "per_thread"], (
537
+ "qk_quant_gran must be either 'per_warp' or 'per_thread'."
538
+ )
539
+ assert q.device == k.device == v.device, "All tensors must be on the same device."
540
+ assert q.dtype == k.dtype == v.dtype, "All tensors must have the same dtype."
541
+
542
+ # cuda_major_version, cuda_minor_version = get_cuda_version()
543
+ # if(cuda_major_version, cuda_minor_version) < (12, 8) and pv_accum_dtype == 'fp32+fp16':
544
+ # warnings.warn("cuda version < 12.8, change pv_accum_dtype to 'fp32+fp32'")
545
+ # pv_accum_dtype = 'fp32+fp32'
546
+
547
+ # FIXME(DefTruth): make sage attention work compatible with distributed
548
+ # env, for example, xDiT which launch by torchrun. Without this workaround,
549
+ # sage attention will run into illegal memory access error after first
550
+ # inference step in distributed env for multi gpus inference. This small
551
+ # workaround also make sage attention work compatible with torch.compile
552
+ # through non-fullgraph compile mode.
553
+ torch.cuda.set_device(v.device)
554
+
555
+ _tensor_layout = 0 if tensor_layout == "NHD" else 1
556
+ _is_caual = 1 if is_causal else 0
557
+ _qk_quant_gran = 3 if qk_quant_gran == "per_thread" else 2
558
+ _return_lse = 1 if return_lse else 0
559
+
560
+ head_dim_og = q.size(-1)
561
+
562
+ if head_dim_og < 64:
563
+ q = torch.nn.functional.pad(q, (0, 64 - head_dim_og))
564
+ k = torch.nn.functional.pad(k, (0, 64 - head_dim_og))
565
+ v = torch.nn.functional.pad(v, (0, 64 - head_dim_og))
566
+ elif head_dim_og > 64 and head_dim_og < 128:
567
+ q = torch.nn.functional.pad(q, (0, 128 - head_dim_og))
568
+ k = torch.nn.functional.pad(k, (0, 128 - head_dim_og))
569
+ v = torch.nn.functional.pad(v, (0, 128 - head_dim_og))
570
+ elif head_dim_og > 128:
571
+ raise ValueError(f"Unsupported head_dim: {head_dim_og}")
572
+
573
+ # assert last dim is contiguous
574
+ assert q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1, (
575
+ "Last dim of qkv must be contiguous."
576
+ )
577
+
578
+ if sm_scale is None:
579
+ sm_scale = head_dim_og**-0.5
580
+
581
+ seq_dim = 1 if _tensor_layout == 0 else 2
582
+ nh_dim = 2 if _tensor_layout == 0 else 1
583
+
584
+ if smooth_k:
585
+ km = k.mean(dim=seq_dim, keepdim=True)
586
+ nqheads = q.size(nh_dim)  # number of heads; the head dimension position depends on tensor_layout
587
+ nkheads = k.size(nh_dim)
588
+ q_per_kv_heads = nqheads // nkheads
589
+ if q_per_kv_heads > 1:
590
+ # nheads_k => nheads_q
591
+ km_broadcast = torch.repeat_interleave(km, q_per_kv_heads, dim=nh_dim)
592
+ else:
593
+ km_broadcast = km
594
+ if return_lse:
595
+ if tensor_layout == "NHD":
596
+ lse_correction = (
597
+ torch.matmul(
598
+ q.transpose(1, 2), km_broadcast.transpose(1, 2).transpose(2, 3)
599
+ )
600
+ .squeeze(-1)
601
+ .to(torch.float32)
602
+ )
603
+ else:
604
+ lse_correction = (
605
+ torch.matmul(q, km_broadcast.transpose(2, 3))
606
+ .squeeze(-1)
607
+ .to(torch.float32)
608
+ )
609
+ else:
610
+ km = None
611
+
612
+ if qk_quant_gran == "per_warp":
613
+ q_int8, q_scale, k_int8, k_scale = per_warp_int8_cuda(
614
+ q, k, km, tensor_layout=tensor_layout, BLKQ=128, WARPQ=32, BLKK=64
615
+ )
616
+ elif qk_quant_gran == "per_thread":
617
+ q_int8, q_scale, k_int8, k_scale = per_thread_int8_triton(
618
+ q, k, km, tensor_layout=tensor_layout, BLKQ=128, WARPQ=32, BLKK=64, WARPK=64
619
+ )
620
+
621
+ o = torch.empty(q.size(), dtype=dtype, device=q.device)
622
+
623
+ if pv_accum_dtype == "fp32+fp32" and smooth_v:
624
+ warnings.warn("pv_accum_dtype is 'fp32+fp32', smooth_v will be ignored.")
625
+ smooth_v = False
626
+
627
+ if pv_accum_dtype == "fp32+fp16" and smooth_v:
628
+ warnings.warn("pv_accum_dtype is 'fp32+fp16', smooth_v will be ignored.")
629
+ smooth_v = False
630
+
631
+ quant_v_scale_max = 448.0
632
+ if pv_accum_dtype == "fp32+fp16":
633
+ quant_v_scale_max = 2.25
634
+
635
+ v_fp8, v_scale, vm = per_channel_fp8(
636
+ v, tensor_layout=tensor_layout, scale_max=quant_v_scale_max, smooth_v=smooth_v
637
+ )
638
+ print("before kernel call")
639
+ if pv_accum_dtype == "fp32":
640
+ if smooth_v:
641
+ lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_fuse_v_mean_attn(
642
+ q_int8,
643
+ k_int8,
644
+ v_fp8,
645
+ o,
646
+ q_scale,
647
+ k_scale,
648
+ v_scale,
649
+ vm,
650
+ _tensor_layout,
651
+ _is_caual,
652
+ _qk_quant_gran,
653
+ sm_scale,
654
+ _return_lse,
655
+ )
656
+ torch.cuda.synchronize()
657
+ else:
658
+ lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn(
659
+ q_int8,
660
+ k_int8,
661
+ v_fp8,
662
+ o,
663
+ q_scale,
664
+ k_scale,
665
+ v_scale,
666
+ _tensor_layout,
667
+ _is_caual,
668
+ _qk_quant_gran,
669
+ sm_scale,
670
+ _return_lse,
671
+ )
672
+ torch.cuda.synchronize()
673
+ elif pv_accum_dtype == "fp32+fp32":
674
+ lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf(
675
+ q_int8,
676
+ k_int8,
677
+ v_fp8,
678
+ o,
679
+ q_scale,
680
+ k_scale,
681
+ v_scale,
682
+ _tensor_layout,
683
+ _is_caual,
684
+ _qk_quant_gran,
685
+ sm_scale,
686
+ _return_lse,
687
+ )
688
+ torch.cuda.synchronize()
689
+ elif pv_accum_dtype == "fp32+fp16":
690
+ lse = ops.qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf(
691
+ q_int8,
692
+ k_int8,
693
+ v_fp8,
694
+ o,
695
+ q_scale,
696
+ k_scale,
697
+ v_scale,
698
+ _tensor_layout,
699
+ _is_caual,
700
+ _qk_quant_gran,
701
+ sm_scale,
702
+ _return_lse,
703
+ )
704
+ torch.cuda.synchronize()
705
+ o = o[..., :head_dim_og]
706
+ print("after kernel call")
707
+ if return_lse:
708
+ return (
709
+ o,
710
+ lse / 1.44269504 + lse_correction * sm_scale
711
+ if smooth_k
712
+ else lse / 1.44269504,
713
+ )
714
+ else:
715
+ return o
716
+
717
+
718
+ @torch.compiler.disable
719
+ def sageattn_qk_int8_pv_fp8_cuda_sm90(
720
+ q: torch.Tensor,
721
+ k: torch.Tensor,
722
+ v: torch.Tensor,
723
+ tensor_layout: str = "HND",
724
+ is_causal: bool = False,
725
+ qk_quant_gran: str = "per_thread",
726
+ sm_scale: Optional[float] = None,
727
+ pv_accum_dtype: str = "fp32+fp32",
728
+ smooth_k: bool = True,
729
+ return_lse: bool = False,
730
+ **kwargs: Any,
731
+ ) -> torch.Tensor:
732
+ """
733
+ SageAttention with INT8 quantization for Q and K, FP8 PV with FP32 accumulation, implemented using CUDA.
734
+
735
+ Parameters
736
+ ----------
737
+ q : torch.Tensor
738
+ The query tensor. Shape:
739
+ - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``.
740
+ - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``.
741
+
742
+ k : torch.Tensor
743
+ The key tensor. Shape:
744
+ - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
745
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
746
+
747
+ v : torch.Tensor
748
+ The value tensor. Shape:
749
+ - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
750
+ - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
751
+
752
+ tensor_layout : str
753
+ The tensor layout, either "HND" or "NHD".
754
+ Default: "HND".
755
+
756
+ is_causal : bool
757
+ Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len.
758
+ Default: False.
759
+
760
+ qk_quant_gran : str
761
+ The granularity of quantization for Q and K, either "per_warp" or "per_thread".
762
+ Default: "per_thread".
763
+
764
+ sm_scale : Optional[float]
765
+ The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``.
766
+
767
+ pv_accum_dtype : str
768
+ The dtype of the accumulation of the product of the value tensor and the attention weights, either "fp32" or "fp32+fp32".
769
+ - "fp32": PV accumulation is done fully in FP32. However, due to a hardware limitation, only 22 bits of the FP32 accumulator are valid.
770
+ - "fp32+fp32": PV accumulation is done in FP32 (actually FP22), but added to a FP32 buffer every few iterations. This offers a balance between speed and accuracy.
771
+ Default: "fp32+fp32".
772
+
773
+ smooth_k : bool
774
+ Whether to smooth the key tensor by subtracting the mean along the sequence dimension.
775
+ Default: True.
776
+
777
+ return_lse : bool
778
+ Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention.
779
+ Default: False.
780
+
781
+ Returns
782
+ -------
783
+ torch.Tensor
784
+ The output tensor. Shape:
785
+ - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``.
786
+ - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``.
787
+
788
+ torch.Tensor
789
+ The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor).
790
+ Shape: ``[batch_size, num_qo_heads, qo_len]``.
791
+ Only returned if `return_lse` is True.
792
+
793
+ Note
794
+ ----
795
+ - ``num_qo_heads`` must be divisible by ``num_kv_heads``.
796
+ - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16``
797
+ - All tensors must be on the same cuda device.
798
+ - `smooth_k` will introduce slight overhead but will improve the accuracy under most circumstances.
799
+ """
800
+
801
+ dtype = q.dtype
802
+ assert q.is_cuda, "Input tensors must be on cuda."
803
+ assert dtype in [torch.float16, torch.bfloat16], (
804
+ "Input tensors must be in dtype of torch.float16 or torch.bfloat16"
805
+ )
806
+ assert qk_quant_gran in ["per_warp", "per_thread"], (
807
+ "qk_quant_gran must be either 'per_warp' or 'per_thread'."
808
+ )
809
+ assert q.device == k.device == v.device, "All tensors must be on the same device."
810
+ assert q.dtype == k.dtype == v.dtype, "All tensors must have the same dtype."
811
+
812
+ torch.cuda.set_device(v.device)
813
+
814
+ _tensor_layout = 0 if tensor_layout == "NHD" else 1
815
+ _is_caual = 1 if is_causal else 0
816
+ _qk_quant_gran = 3 if qk_quant_gran == "per_thread" else 2
817
+ _return_lse = 1 if return_lse else 0
818
+
819
+ head_dim_og = q.size(-1)
820
+
821
+ if head_dim_og < 64:
822
+ q = torch.nn.functional.pad(q, (0, 64 - head_dim_og))
823
+ k = torch.nn.functional.pad(k, (0, 64 - head_dim_og))
824
+ v = torch.nn.functional.pad(v, (0, 64 - head_dim_og))
825
+ elif head_dim_og > 64 and head_dim_og < 128:
826
+ q = torch.nn.functional.pad(q, (0, 128 - head_dim_og))
827
+ k = torch.nn.functional.pad(k, (0, 128 - head_dim_og))
828
+ v = torch.nn.functional.pad(v, (0, 128 - head_dim_og))
829
+ elif head_dim_og > 128:
830
+ raise ValueError(f"Unsupported head_dim: {head_dim_og}")
831
+
832
+ # assert last dim is contiguous
833
+ assert q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1, (
834
+ "Last dim of qkv must be contiguous."
835
+ )
836
+
837
+ if sm_scale is None:
838
+ sm_scale = head_dim_og**-0.5
839
+
840
+ seq_dim = 1 if _tensor_layout == 0 else 2
841
+ nh_dim = 2 if _tensor_layout == 0 else 1
842
+
843
+ if smooth_k:
844
+ km = k.mean(dim=seq_dim, keepdim=True)
845
+ nqheads = q.size(nh_dim)  # number of heads; the head dimension position depends on tensor_layout
846
+ nkheads = k.size(nh_dim)
847
+ q_per_kv_heads = nqheads // nkheads
848
+ if q_per_kv_heads > 1:
849
+ # nheads_k => nheads_q
850
+ km_broadcast = torch.repeat_interleave(km, q_per_kv_heads, dim=nh_dim)
851
+ else:
852
+ km_broadcast = km
853
+ if return_lse:
854
+ if tensor_layout == "NHD":
855
+ lse_correction = (
856
+ torch.matmul(
857
+ q.transpose(1, 2), km_broadcast.transpose(1, 2).transpose(2, 3)
858
+ )
859
+ .squeeze(-1)
860
+ .to(torch.float32)
861
+ )
862
+ else:
863
+ lse_correction = (
864
+ torch.matmul(q, km_broadcast.transpose(2, 3))
865
+ .squeeze(-1)
866
+ .to(torch.float32)
867
+ )
868
+ else:
869
+ km = None
870
+
871
+ if qk_quant_gran == "per_warp":
872
+ q_int8, q_scale, k_int8, k_scale = per_warp_int8_cuda(
873
+ q, k, km, tensor_layout=tensor_layout, BLKQ=64, WARPQ=16, BLKK=128
874
+ )
875
+ elif qk_quant_gran == "per_thread":
876
+ q_int8, q_scale, k_int8, k_scale = per_thread_int8_triton(
877
+ q,
878
+ k,
879
+ km,
880
+ tensor_layout=tensor_layout,
881
+ BLKQ=64,
882
+ WARPQ=16,
883
+ BLKK=128,
884
+ WARPK=128,
885
+ )
886
+
887
+ o = torch.empty(q.size(), dtype=dtype, device=q.device)
888
+
889
+ # pad v to multiple of 128
890
+ # TODO: modify per_channel_fp8 kernel to handle this
891
+ kv_len = k.size(seq_dim)
892
+ v_pad_len = 128 - (kv_len % 128) if kv_len % 128 != 0 else 0
893
+ if v_pad_len > 0:
894
+ if tensor_layout == "HND":
895
+ v = torch.cat(
896
+ [
897
+ v,
898
+ torch.zeros(
899
+ v.size(0),
900
+ v.size(1),
901
+ v_pad_len,
902
+ v.size(3),
903
+ dtype=v.dtype,
904
+ device=v.device,
905
+ ),
906
+ ],
907
+ dim=2,
908
+ )
909
+ else:
910
+ v = torch.cat(
911
+ [
912
+ v,
913
+ torch.zeros(
914
+ v.size(0),
915
+ v_pad_len,
916
+ v.size(2),
917
+ v.size(3),
918
+ dtype=v.dtype,
919
+ device=v.device,
920
+ ),
921
+ ],
922
+ dim=1,
923
+ )
924
+
925
+ v_fp8, v_scale, _ = per_channel_fp8(v, tensor_layout=tensor_layout, smooth_v=False)
926
+
927
+ if pv_accum_dtype == "fp32":
928
+ raise NotImplementedError("Please use pv_accum_dtype='fp32+fp32' for sm90.")
929
+ lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn(
930
+ q_int8,
931
+ k_int8,
932
+ v_fp8,
933
+ o,
934
+ q_scale,
935
+ k_scale,
936
+ v_scale,
937
+ _tensor_layout,
938
+ _is_caual,
939
+ _qk_quant_gran,
940
+ sm_scale,
941
+ _return_lse,
942
+ )
943
+ elif pv_accum_dtype == "fp32+fp32":
944
+ print(
945
+ "qint8",
946
+ q_int8.shape,
947
+ "qscale",
948
+ q_scale.shape,
949
+ "kint8",
950
+ k_int8.shape,
951
+ "kscale",
952
+ k_scale.shape,
953
+ "vfp8",
954
+ v_fp8.shape,
955
+ "vscale",
956
+ v_scale.shape,
957
+ )
958
+ lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf_sm90(
959
+ q_int8,
960
+ k_int8,
961
+ v_fp8,
962
+ o,
963
+ q_scale,
964
+ k_scale,
965
+ v_scale,
966
+ _tensor_layout,
967
+ _is_caual,
968
+ _qk_quant_gran,
969
+ sm_scale,
970
+ _return_lse,
971
+ )
972
+
973
+ o = o[..., :head_dim_og]
974
+
975
+ if return_lse:
976
+ return (
977
+ o,
978
+ lse / 1.44269504 + lse_correction * sm_scale
979
+ if smooth_k
980
+ else lse / 1.44269504,
981
+ )
982
+ else:
983
+ return o
build/torch28-cxx11-cu128-x86_64-linux/sage_attention/layers.py ADDED
File without changes
build/torch28-cxx11-cu128-x86_64-linux/sage_attention/quant.py ADDED
@@ -0,0 +1,326 @@
1
+ """
2
+ Copyright (c) 2024 by SageAttention team.
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
+ import torch
+ from typing import Optional
+
+ from ._ops import ops
+
+
+ def per_block_int8(
+     q: torch.Tensor,
+     k: torch.Tensor,
+     km: Optional[torch.Tensor] = None,
+     BLKQ: int = 128,
+     BLKK: int = 64,
+     sm_scale: Optional[float] = None,
+     tensor_layout: str = "HND",
+ ):
+     """
+     Quantize the query tensor `q` and the key tensor `k` with per-block quantization.
+
+     Parameters
+     ----------
+     q : torch.Tensor
+         The query tensor. Shape:
+         - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``.
+         - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``.
+
+     k : torch.Tensor
+         The key tensor. Shape:
+         - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
+         - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
+
+     km : Optional[torch.Tensor]
+         The mean tensor of `k` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]``.
+         Should be of the same dtype as `k` if provided. Default is None.
+
+     sm_scale : Optional[float]
+         The scale factor for the softmax operation. Default is ``head_dim**-0.5``.
+         It will be multiplied by ``1.44269504`` (log2(e)) to work together with the Triton attention kernel.
+
+     tensor_layout : str
+         The tensor layout, either "HND" or "NHD".
+         Default: "HND".
+
+     Returns
+     -------
+     Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]
+         A tuple containing:
+         - The quantized query tensor. Shape: same as `q` but with `int8` dtype.
+         - The scale tensor of the query tensor. Shape: ``[batch_size, num_qo_heads, (qo_len + BLKQ - 1) // BLKQ]`` with `float32` dtype.
+         - The quantized key tensor. Shape: same as `k` but with `int8` dtype.
+         - The scale tensor of the key tensor. Shape: ``[batch_size, num_kv_heads, (kv_len + BLKK - 1) // BLKK]`` with `float32` dtype.
+
+     Note
+     ----
+     - The tensors `q` and `k` must have the dtype ``torch.float16`` or ``torch.bfloat16``.
+     """
+
+     q_int8 = torch.empty(q.shape, dtype=torch.int8, device=q.device)
+     k_int8 = torch.empty(k.shape, dtype=torch.int8, device=k.device)
+
+     if tensor_layout == "HND":
+         b, h_qo, qo_len, head_dim = q.shape
+         _, h_kv, kv_len, _ = k.shape
+
+     elif tensor_layout == "NHD":
+         b, qo_len, h_qo, head_dim = q.shape
+         _, kv_len, h_kv, _ = k.shape
+
+     else:
+         raise ValueError(f"Unknown tensor layout: {tensor_layout}")
+
+     _tensor_layout = 0 if tensor_layout == "NHD" else 1
+
+     q_scale = torch.empty(
+         (b, h_qo, (qo_len + BLKQ - 1) // BLKQ), device=q.device, dtype=torch.float32
+     )
+     k_scale = torch.empty(
+         (b, h_kv, (kv_len + BLKK - 1) // BLKK), device=q.device, dtype=torch.float32
+     )
+
+     if sm_scale is None:
+         sm_scale = head_dim**-0.5
+
+     sm_scale *= 1.44269504
+
+     ops.quant_per_block_int8_cuda(q, q_int8, q_scale, sm_scale, BLKQ, _tensor_layout)
+     if km is not None:
+         km = km.squeeze(1) if _tensor_layout == 0 else km.squeeze(2)
+         ops.quant_per_block_int8_fuse_sub_mean_cuda(
+             k, km, k_int8, k_scale, BLKK, _tensor_layout
+         )
+     else:
+         # The bound CUDA op expects an sm_scale argument; use 1.0 for K to avoid scaling
+         ops.quant_per_block_int8_cuda(k, k_int8, k_scale, 1.0, BLKK, _tensor_layout)
+
+     return q_int8, q_scale, k_int8, k_scale
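A minimal usage sketch (not part of the commit), assuming a CUDA device, hypothetical shapes (batch 1, 8 heads, 1024 tokens, head_dim 128), and the module path `sage_attention.quant` from this build tree:

    import torch
    from sage_attention.quant import per_block_int8  # assumed import path for this build

    q = torch.randn(1, 8, 1024, 128, dtype=torch.float16, device="cuda")
    k = torch.randn(1, 8, 1024, 128, dtype=torch.float16, device="cuda")

    q_int8, q_scale, k_int8, k_scale = per_block_int8(q, k, tensor_layout="HND")
    # With the defaults BLKQ=128 and BLKK=64, the docstring above gives
    # q_scale: [1, 8, ceil(1024/128)] = [1, 8, 8] and k_scale: [1, 8, ceil(1024/64)] = [1, 8, 16]
    assert q_scale.shape == (1, 8, 8) and k_scale.shape == (1, 8, 16)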
+
+
+ def per_warp_int8(
+     q: torch.Tensor,
+     k: torch.Tensor,
+     km: Optional[torch.Tensor] = None,
+     BLKQ: int = 128,
+     WARPQ: int = 32,
+     BLKK: int = 64,
+     tensor_layout: str = "HND",
+ ):
+     """
+     Quantize the query tensor `q` with per-warp quantization and the key tensor `k` with per-block quantization.
+     The warp size used to quantize `q` is 16 or 32, with a block size of 64 or 128.
+     The block size used to quantize `k` is 64 or 128.
+
+     Parameters
+     ----------
+     q : torch.Tensor
+         The query tensor. Shape:
+         - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``.
+         - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``.
+
+     k : torch.Tensor
+         The key tensor. Shape:
+         - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
+         - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
+
+     km : Optional[torch.Tensor]
+         The mean tensor of `k` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]``.
+         Should be of the same dtype as `k` if provided. Default is None.
+
+     tensor_layout : str
+         The tensor layout, either "HND" or "NHD".
+         Default: "HND".
+
+     Returns
+     -------
+     Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]
+         A tuple containing:
+         - The quantized query tensor. Shape: same as `q` but with `int8` dtype.
+         - The scale tensor of the query tensor. Shape: ``[batch_size, num_qo_heads, ((qo_len + BLKQ - 1) // BLKQ) * (BLKQ // WARPQ)]`` with `float32` dtype.
+         - The quantized key tensor. Shape: same as `k` but with `int8` dtype.
+         - The scale tensor of the key tensor. Shape: ``[batch_size, num_kv_heads, (kv_len + BLKK - 1) // BLKK]`` with `float32` dtype.
+
+     Note
+     ----
+     - The tensors `q` and `k` must have the dtype ``torch.float16`` or ``torch.bfloat16``.
+     """
+
+     q_int8 = torch.empty(q.shape, dtype=torch.int8, device=q.device)
+     k_int8 = torch.empty(k.shape, dtype=torch.int8, device=k.device)
+
+     if tensor_layout == "HND":
+         b, h_qo, qo_len, head_dim = q.shape
+         _, h_kv, kv_len, _ = k.shape
+
+     elif tensor_layout == "NHD":
+         b, qo_len, h_qo, head_dim = q.shape
+         _, kv_len, h_kv, _ = k.shape
+
+     else:
+         raise ValueError(f"Unknown tensor layout: {tensor_layout}")
+
+     _tensor_layout = 0 if tensor_layout == "NHD" else 1
+
+     q_scale = torch.empty(
+         (b, h_qo, ((qo_len + BLKQ - 1) // BLKQ) * (BLKQ // WARPQ)),
+         device=q.device,
+         dtype=torch.float32,
+     )
+     k_scale = torch.empty(
+         (b, h_kv, (kv_len + BLKK - 1) // BLKK), device=q.device, dtype=torch.float32
+     )
+
+     ops.quant_per_warp_int8_cuda(q, q_int8, q_scale, BLKQ, WARPQ, _tensor_layout)
+
+     if km is not None:
+         km = km.squeeze(1) if _tensor_layout == 0 else km.squeeze(2)
+         ops.quant_per_block_int8_fuse_sub_mean_cuda(
+             k, km, k_int8, k_scale, BLKK, _tensor_layout
+         )
+     else:
+         # The bound CUDA op expects an sm_scale argument; use 1.0 for K to avoid scaling
+         ops.quant_per_block_int8_cuda(k, k_int8, k_scale, 1.0, BLKK, _tensor_layout)
+
+     return q_int8, q_scale, k_int8, k_scale
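A companion sketch for `per_warp_int8` under the same hypothetical shapes and assumed import path; only the query scale gains the extra per-warp factor `BLKQ // WARPQ`:

    import torch
    from sage_attention.quant import per_warp_int8  # assumed import path for this build

    q = torch.randn(1, 8, 1024, 128, dtype=torch.float16, device="cuda")
    k = torch.randn(1, 8, 1024, 128, dtype=torch.float16, device="cuda")

    q_int8, q_scale, k_int8, k_scale = per_warp_int8(q, k, BLKQ=128, WARPQ=32, BLKK=64)
    # ceil(1024/128) * (128 // 32) = 8 * 4 = 32 query scales per head,
    # ceil(1024/64) = 16 key scales per head
    assert q_scale.shape == (1, 8, 32) and k_scale.shape == (1, 8, 16)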
+
+
+ def sub_mean(v: torch.Tensor, tensor_layout: str = "HND"):
+     """
+     Calculate the mean of the tensor `v` along the sequence length dimension and subtract it from `v`.
+     The result is stored as fp16.
+
+     Parameters
+     ----------
+     v : torch.Tensor
+         The input tensor. Shape:
+         - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
+         - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
+
+     tensor_layout : str
+         The tensor layout, either "HND" or "NHD".
+         Default: "HND".
+
+     Returns
+     -------
+     Tuple[torch.Tensor, torch.Tensor]
+         A tuple containing:
+         - The tensor `v_smoothed` with the mean subtracted, stored as fp16. Shape: same as `v`, with `float16` dtype.
+         - The mean tensor of `v` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]``, with the same dtype as `v`.
+
+     Note
+     ----
+     - The tensor `v` must have the dtype ``torch.float16`` or ``torch.bfloat16``.
+     - The returned tensor `v_smoothed` will have dtype ``torch.float16`` regardless of the input dtype.
+     - The returned mean tensor will have the same dtype as the input tensor.
+     """
+
+     _tensor_layout = 0 if tensor_layout == "NHD" else 1
+     vm = v.mean(dim=1 if _tensor_layout == 0 else 2)
+
+     v_smoothed = torch.empty(v.shape, dtype=torch.float16, device=v.device)
+
+     # subtract the mean and store the result as fp16
+     ops.sub_mean_cuda(v, vm, v_smoothed, _tensor_layout)
+
+     return v_smoothed, vm
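A minimal sketch for `sub_mean`, assuming hypothetical HND shapes and the same assumed import path; it only illustrates the dtypes and shapes documented above:

    import torch
    from sage_attention.quant import sub_mean  # assumed import path for this build

    v = torch.randn(1, 8, 1024, 128, dtype=torch.bfloat16, device="cuda")
    v_smoothed, vm = sub_mean(v, tensor_layout="HND")
    # v_smoothed is always fp16; vm keeps v's dtype and drops the sequence dimension
    assert v_smoothed.dtype == torch.float16 and vm.shape == (1, 8, 128)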
+
+
+ def per_channel_fp8(
+     v: torch.Tensor,
+     tensor_layout: str = "HND",
+     scale_max: float = 448.0,
+     smooth_v: bool = True,
+ ):
+     """
+     Transpose, pad and permute the tensor `v`, then quantize it to fp8 with per-channel quantization.
+     `v` is first transposed along the head dimension and the sequence length dimension, then padded to a multiple of 64.
+     After that, the tensor is permuted along the sequence length dimension by ``[0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15]``.
+     The quantization is done per channel, with the scale value and smoothing factor computed per channel.
+
+     Parameters
+     ----------
+     v : torch.Tensor
+         The input tensor. Shape:
+         - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
+         - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
+
+     tensor_layout : str
+         The tensor layout, either "HND" or "NHD".
+         Default: "HND".
+
+     scale_max : float
+         The maximum scale value for the quantization. Default is 448.0 (the upper bound of the E4M3 data format).
+
+     smooth_v : bool
+         Whether to smooth the quantized tensor. Default is True.
+
+     Returns
+     -------
+     Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]
+         A tuple containing:
+         - The quantized tensor `v_fp8`. Shape:
+             - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, head_dim, (kv_len + 63) // 64 * 64]``, with `float8_e4m3fn` dtype.
+             - If `tensor_layout` is "NHD": ``[batch_size, head_dim, num_kv_heads, (kv_len + 63) // 64 * 64]``, with `float8_e4m3fn` dtype.
+         - The scale tensor of `v`. Shape: ``[batch_size, num_kv_heads, head_dim]`` with `float32` dtype.
+         - The mean tensor of `v` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]`` with `float32` dtype.
+
+     Note
+     ----
+     - The tensor `v` must have the dtype ``torch.float16`` or ``torch.bfloat16``.
+     - The returned mean tensor will be None if `smooth_v` is False. Otherwise it will have dtype ``torch.float32``.
+     """
+
+     _tensor_layout = 0 if tensor_layout == "NHD" else 1
+
+     if tensor_layout == "HND":
+         b, h_kv, kv_len, head_dim = v.shape
+         padded_len = (kv_len + 63) // 64 * 64
+         v_transposed_permutted = torch.empty(
+             (b, h_kv, head_dim, padded_len), dtype=v.dtype, device=v.device
+         )
+
+     elif tensor_layout == "NHD":
+         b, kv_len, h_kv, head_dim = v.shape
+         padded_len = (kv_len + 63) // 64 * 64
+         v_transposed_permutted = torch.empty(
+             (b, head_dim, h_kv, padded_len), dtype=v.dtype, device=v.device
+         )
+
+     else:
+         raise ValueError(f"Unknown tensor layout: {tensor_layout}")
+
+     ops.transpose_pad_permute_cuda(v, v_transposed_permutted, _tensor_layout)
+
+     v_fp8 = torch.empty(
+         v_transposed_permutted.shape, dtype=torch.float8_e4m3fn, device=v.device
+     )
+
+     v_scale = torch.empty((b, h_kv, head_dim), dtype=torch.float32, device=v.device)
+     vm = torch.empty((b, h_kv, head_dim), dtype=torch.float32, device=v.device)
+
+     if smooth_v:
+         ops.mean_scale_fuse_quant_cuda(
+             v_transposed_permutted,
+             v_fp8,
+             vm,
+             v_scale,
+             kv_len,
+             scale_max,
+             _tensor_layout,
+         )
+         return v_fp8, v_scale, vm
+     else:
+         ops.scale_fuse_quant_cuda(
+             v_transposed_permutted, v_fp8, v_scale, kv_len, scale_max, _tensor_layout
+         )
+         return v_fp8, v_scale, None
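A sketch for `per_channel_fp8` with hypothetical shapes and the same assumed import path; a `kv_len` of 1000 is used so the padding to the next multiple of 64 (1024) is visible:

    import torch
    from sage_attention.quant import per_channel_fp8  # assumed import path for this build

    v = torch.randn(1, 8, 1000, 128, dtype=torch.float16, device="cuda")
    v_fp8, v_scale, vm = per_channel_fp8(v, tensor_layout="HND", smooth_v=True)
    # kv_len=1000 is padded up to 1024; the HND output layout is [b, h_kv, head_dim, padded_len]
    assert v_fp8.shape == (1, 8, 128, 1024) and v_fp8.dtype == torch.float8_e4m3fn
    assert v_scale.shape == (1, 8, 128) and vm is not None

    # With smooth_v=False the mean slot of the returned tuple is None
    v_fp8, v_scale, vm = per_channel_fp8(v, smooth_v=False)
    assert vm is None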
build/torch28-cxx11-cu128-x86_64-linux/sage_attention/quant_per_thread.py ADDED
@@ -0,0 +1,204 @@
+ """
+ Copyright (c) 2024 by SageAttention team.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
+ import torch
+ import triton
+ import triton.language as tl
+
+ @triton.jit
+ def quant_query_per_thread_int8_kernel(Input, Output, Scale, L,
+                                        stride_iz, stride_ih, stride_in,
+                                        stride_oz, stride_oh, stride_on,
+                                        stride_sz, stride_sh,
+                                        C: tl.constexpr, BLK: tl.constexpr):
+     off_blk = tl.program_id(0) // 8
+     off_tld = tl.program_id(0) % 8
+     off_h = tl.program_id(1)
+     off_b = tl.program_id(2)
+
+     offs_n = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld
+     offs_k = tl.arange(0, C)
+
+     input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :]
+     output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :]
+     scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 8 + off_tld
+
+     x = tl.load(input_ptrs, mask=offs_n[:, None] < L)
+     x = x.to(tl.float32)
+     scale = tl.max(tl.abs(x)) / 127. + 0.0000001
+     x_int8 = x / scale
+     x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1)
+     x_int8 = x_int8.to(tl.int8)
+     tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L)
+     tl.store(scale_ptrs, scale)
+
+ @triton.jit
+ def quant_key_per_thread_int8_kernel(Input, Output, Scale, L,
+                                      stride_iz, stride_ih, stride_in,
+                                      stride_oz, stride_oh, stride_on,
+                                      stride_sz, stride_sh,
+                                      C: tl.constexpr, BLK: tl.constexpr):
+     off_blk = tl.program_id(0) // 4
+     off_tld = tl.program_id(0) % 4
+     off_h = tl.program_id(1)
+     off_b = tl.program_id(2)
+
+     # offs_n = off_blk * BLK + tl.cat(tl.arange(0, BLK // 8) * 8, tl.arange(0, BLK // 8) * 8 + 1, True) + off_tld * 2
+     # offs_k = tl.arange(0, C)
+
+     # input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :]
+     # output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :]
+     # scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 4 + off_tld
+
+     # x = tl.load(input_ptrs, mask=offs_n[:, None] < L)
+     # x = x.to(tl.float32)
+     # scale = tl.max(tl.abs(x)) / 127. + 0.0000001
+     # x_int8 = x / scale
+     # x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1)
+     # x_int8 = x_int8.to(tl.int8)
+     # tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L)
+     # tl.store(scale_ptrs, scale)
+
+     offs_n0 = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld * 2
+     offs_n1 = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld * 2 + 1
+     offs_k = tl.arange(0, C)
+
+     input_ptrs0 = Input + off_b * stride_iz + off_h * stride_ih + offs_n0[:, None] * stride_in + offs_k[None, :]
+     input_ptrs1 = Input + off_b * stride_iz + off_h * stride_ih + offs_n1[:, None] * stride_in + offs_k[None, :]
+     output_ptrs0 = Output + off_b * stride_oz + off_h * stride_oh + offs_n0[:, None] * stride_on + offs_k[None, :]
+     output_ptrs1 = Output + off_b * stride_oz + off_h * stride_oh + offs_n1[:, None] * stride_on + offs_k[None, :]
+     scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 4 + off_tld
+
+     x0 = tl.load(input_ptrs0, mask=offs_n0[:, None] < L)
+     x1 = tl.load(input_ptrs1, mask=offs_n1[:, None] < L)
+     x0 = x0.to(tl.float32)
+     x1 = x1.to(tl.float32)
+     scale = max(tl.max(tl.abs(x0)), tl.max(tl.abs(x1))) / 127. + 0.0000001
+     x0_int8 = x0 / scale
+     x1_int8 = x1 / scale
+     x0_int8 += 0.5 * tl.where(x0_int8 >= 0, 1, -1)
+     x1_int8 += 0.5 * tl.where(x1_int8 >= 0, 1, -1)
+     x0_int8 = x0_int8.to(tl.int8)
+     x1_int8 = x1_int8.to(tl.int8)
+     tl.store(output_ptrs0, x0_int8, mask=offs_n0[:, None] < L)
+     tl.store(output_ptrs1, x1_int8, mask=offs_n1[:, None] < L)
+     tl.store(scale_ptrs, scale)
+
+ @triton.jit
+ def quant_query_per_thread_int4_kernel(Input, Output, Scale, L,
+                                        stride_iz, stride_ih, stride_in,
+                                        stride_oz, stride_oh, stride_on,
+                                        stride_sz, stride_sh,
+                                        C: tl.constexpr, BLK: tl.constexpr):
+     off_blk = tl.program_id(0) // 8
+     off_tld = tl.program_id(0) % 8
+     off_h = tl.program_id(1)
+     off_b = tl.program_id(2)
+
+     offs_n = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld
+     offs_k = tl.arange(0, C)
+
+     input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :]
+     output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :]
+     scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 8 + off_tld
+
+     x = tl.load(input_ptrs, mask=offs_n[:, None] < L)
+     x = x.to(tl.float32)
+     scale = tl.max(tl.abs(x)) / 7. + 0.0000001
+     x_int8 = x / scale
+     x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1)
+     x_int8 = x_int8.to(tl.int8)
+     tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L)
+     tl.store(scale_ptrs, scale)
+
+ @triton.jit
+ def quant_key_per_thread_int4_kernel(Input, Output, Scale, L,
+                                      stride_iz, stride_ih, stride_in,
+                                      stride_oz, stride_oh, stride_on,
+                                      stride_sz, stride_sh,
+                                      C: tl.constexpr, BLK: tl.constexpr):
+     off_blk = tl.program_id(0) // 4
+     off_tld = tl.program_id(0) % 4
+     off_h = tl.program_id(1)
+     off_b = tl.program_id(2)
+
+     offs_n = off_blk * BLK + tl.cat(tl.arange(0, BLK // 8) * 8, tl.arange(0, BLK // 8) * 8 + 1, True) + off_tld * 2
+     offs_k = tl.arange(0, C)
+
+     input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :]
+     output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :]
+     scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 4 + off_tld
+
+     x = tl.load(input_ptrs, mask=offs_n[:, None] < L)
+     x = x.to(tl.float32)
+     scale = tl.max(tl.abs(x)) / 7. + 0.0000001
+     x_int8 = x / scale
+     x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1)
+     x_int8 = x_int8.to(tl.int8)
+     tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L)
+     tl.store(scale_ptrs, scale)
+
+ def per_thread_int8(q, k, km=None, BLKQ=128, WARPQ=32, BLKK=64, WARPK=64, sm_scale=None, tensor_layout="HND"):
+     q_int8 = torch.empty(q.shape, dtype=torch.int8, device=q.device)
+     k_int8 = torch.empty(k.shape, dtype=torch.int8, device=k.device)
+
+     if km is not None:
+         k = k - km
+
+     if tensor_layout == "HND":
+         b, h_qo, qo_len, head_dim = q.shape
+         _, h_kv, kv_len, _ = k.shape
+
+         stride_bz_q, stride_h_q, stride_seq_q = q.stride(0), q.stride(1), q.stride(2)
+         stride_bz_qo, stride_h_qo, stride_seq_qo = q_int8.stride(0), q_int8.stride(1), q_int8.stride(2)
+         stride_bz_k, stride_h_k, stride_seq_k = k.stride(0), k.stride(1), k.stride(2)
+         stride_bz_ko, stride_h_ko, stride_seq_ko = k_int8.stride(0), k_int8.stride(1), k_int8.stride(2)
+     elif tensor_layout == "NHD":
+         b, qo_len, h_qo, head_dim = q.shape
+         _, kv_len, h_kv, _ = k.shape
+
+         stride_bz_q, stride_h_q, stride_seq_q = q.stride(0), q.stride(2), q.stride(1)
+         stride_bz_qo, stride_h_qo, stride_seq_qo = q_int8.stride(0), q_int8.stride(2), q_int8.stride(1)
+         stride_bz_k, stride_h_k, stride_seq_k = k.stride(0), k.stride(2), k.stride(1)
+         stride_bz_ko, stride_h_ko, stride_seq_ko = k_int8.stride(0), k_int8.stride(2), k_int8.stride(1)
+     else:
+         raise ValueError(f"Unknown tensor layout: {tensor_layout}")
+
+     q_scale = torch.empty((b, h_qo, (qo_len + BLKQ - 1) // BLKQ * (BLKQ // WARPQ) * 8), device=q.device, dtype=torch.float32)
+     k_scale = torch.empty((b, h_kv, (kv_len + BLKK - 1) // BLKK * (BLKK // WARPK) * 4), device=q.device, dtype=torch.float32)
+
+     if sm_scale is None:
+         sm_scale = head_dim**-0.5
+
+     grid = ((qo_len + BLKQ - 1) // BLKQ * (BLKQ // WARPQ) * 8, h_qo, b)
+     quant_query_per_thread_int8_kernel[grid](
+         q, q_int8, q_scale, qo_len,
+         stride_bz_q, stride_h_q, stride_seq_q,
+         stride_bz_qo, stride_h_qo, stride_seq_qo,
+         q_scale.stride(0), q_scale.stride(1),
+         C=head_dim, BLK=WARPQ
+     )
+
+     grid = ((kv_len + BLKK - 1) // BLKK * (BLKK // WARPK) * 4, h_kv, b)
+     quant_key_per_thread_int8_kernel[grid](
+         k, k_int8, k_scale, kv_len,
+         stride_bz_k, stride_h_k, stride_seq_k,
+         stride_bz_ko, stride_h_ko, stride_seq_ko,
+         k_scale.stride(0), k_scale.stride(1),
+         C=head_dim, BLK=WARPK
+     )
+
+     return q_int8, q_scale, k_int8, k_scale
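A sketch for the Triton per-thread path, with hypothetical shapes chosen so the sequence lengths are multiples of the block sizes and the same assumed import convention. Note that in this code path `sm_scale` is computed when left as None but is not folded into the quantized values by the kernels above.

    import torch
    from sage_attention.quant_per_thread import per_thread_int8  # assumed import path

    q = torch.randn(1, 8, 1024, 128, dtype=torch.float16, device="cuda")
    k = torch.randn(1, 8, 1024, 128, dtype=torch.float16, device="cuda")

    q_int8, q_scale, k_int8, k_scale = per_thread_int8(q, k, tensor_layout="HND")
    # ceil(1024/128) * (128//32) * 8 = 256 query scales per head,
    # ceil(1024/64) * (64//64) * 4 = 64 key scales per head
    assert q_scale.shape == (1, 8, 256) and k_scale.shape == (1, 8, 64)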