import numpy as np
import pycuda.autoprimaryctx
import torch
import triton
import triton.language as tl
from flash_attn import flash_attn_varlen_func
from pycuda.compiler import SourceModule


@triton.autotune(
    configs=[
        triton.Config({}, num_stages=1, num_warps=4),
        triton.Config({}, num_stages=1, num_warps=8),
        triton.Config({}, num_stages=2, num_warps=4),
        triton.Config({}, num_stages=2, num_warps=8),
        triton.Config({}, num_stages=3, num_warps=4),
        triton.Config({}, num_stages=3, num_warps=8),
        triton.Config({}, num_stages=4, num_warps=4),
        triton.Config({}, num_stages=4, num_warps=8),
        triton.Config({}, num_stages=5, num_warps=4),
        triton.Config({}, num_stages=5, num_warps=8),
    ],
    key=['N_CTX'],
)
@triton.jit
def triton_sparse_fwd_kernel(
    Q, K, V, seqlens, sm_scale,
    col_count, col_index,
    Out,
    stride_qz, stride_qh, stride_qm, stride_qk,
    stride_kz, stride_kh, stride_kn, stride_kk,
    stride_vz, stride_vh, stride_vn, stride_vk,
    stride_oz, stride_oh, stride_om, stride_ok,
    Z, H, N_CTX,
    NUM_ROWS, MAX_COLS_PER_ROW,
    BLOCK_M: tl.constexpr,
    BLOCK_N: tl.constexpr,
    BLOCK_DMODEL: tl.constexpr,
    dtype: tl.constexpr,
):
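    # One program instance handles one BLOCK_M tile of query rows for one
    # (batch, head) pair. Instead of scanning all keys, the tile reads its
    # column list from col_index (length col_count), which merges the vertical
    # columns and slash-diagonal column ranges in ascending order.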
    start_m = tl.program_id(0)
    off_hz = tl.program_id(1)

    seqlen = tl.load(seqlens + off_hz // H)
    if start_m * BLOCK_M >= seqlen:
        return

    # initialize offsets
    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
    offs_n = tl.arange(0, BLOCK_N)
    offs_d = tl.arange(0, BLOCK_DMODEL)

    qo_offset = (off_hz // H) * stride_qz + (off_hz % H) * stride_qh
    kv_offset = (off_hz // H) * stride_kz + (off_hz % H) * stride_kh

    q_ptrs = Q + qo_offset + offs_m[:, None] * stride_qm + offs_d[None, :] * stride_qk
    k_ptrs = K + kv_offset + offs_d[:, None] * stride_kk
    v_ptrs = V + kv_offset + offs_d[None, :] * stride_vk
    o_ptrs = Out + qo_offset + offs_m[:, None] * stride_om + offs_d[None, :] * stride_ok

    num_cols = tl.load(col_count + off_hz * NUM_ROWS + start_m)
    cols_ptr = col_index + (off_hz * NUM_ROWS + start_m) * MAX_COLS_PER_ROW

    # initialize pointer to m and l
    m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
    l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
    acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
    # scale sm_scale by log_2(e) and use
    # 2^x instead of exp in the loop because CSE and LICM
    # don't work as expected with `exp` in the loop
    qk_scale = sm_scale * 1.44269504
    # load q: it will stay in SRAM throughout
    q = tl.load(q_ptrs)
    q = (q * qk_scale).to(dtype)

    # loop over k, v and update accumulator
    m_mask = offs_m[:, None] < seqlen
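    # `split` rounds the column count down to a multiple of BLOCK_N while
    # reserving at least the last BLOCK_N columns for the second loop: tiles
    # before `split` need no column masking, while the tail tiles near the
    # diagonal get the bounds and causal masks.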
    split = tl.maximum(num_cols - BLOCK_N, 0) & ~(BLOCK_N - 1)

    for start_n in range(0, split, BLOCK_N):
        cols = tl.load(cols_ptr + start_n + offs_n)
        # -- load k, v --
        k = tl.load(k_ptrs + cols[None, :] * stride_kn)
        v = tl.load(v_ptrs + cols[:, None] * stride_vn)
        # -- compute qk --
        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
        qk = tl.where(m_mask, qk, float("-inf"))
        qk += tl.dot(q, k)
        # -- compute scaling constant --
        m_i_new = tl.maximum(m_i, tl.max(qk, 1))
        alpha = tl.math.exp2(m_i - m_i_new)
        p = tl.math.exp2(qk - m_i_new[:, None])
        # -- scale and update acc --
        acc_scale = l_i * 0 + alpha  # workaround for a Triton compiler bug
        acc *= acc_scale[:, None]
        acc += tl.dot(p.to(dtype), v)
        # -- update m_i and l_i --
        l_i = l_i * alpha + tl.sum(p, 1)
        m_i = m_i_new

    for start_n in range(split, num_cols, BLOCK_N):
        n_mask = start_n + offs_n < num_cols
        cols = tl.load(cols_ptr + start_n + offs_n, mask=n_mask, other=N_CTX - 1)
        causal_mask = cols[None, :] <= offs_m[:, None]
        # -- load k, v --
        k = tl.load(k_ptrs + cols[None, :] * stride_kn)
        v = tl.load(v_ptrs + cols[:, None] * stride_vn)
        # -- compute qk --
        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
        qk = tl.where(m_mask & causal_mask, qk, float("-inf"))
        qk += tl.dot(q, k)
        # -- compute scaling constant --
        m_i_new = tl.maximum(m_i, tl.max(qk, 1))
        alpha = tl.math.exp2(m_i - m_i_new)
        p = tl.math.exp2(qk - m_i_new[:, None])
        # -- scale and update acc --
        acc_scale = l_i * 0 + alpha  # workaround for a Triton compiler bug
        acc *= acc_scale[:, None]
        acc += tl.dot(p.to(dtype), v)
        # -- update m_i and l_i --
        l_i = l_i * alpha + tl.sum(p, 1)
        m_i = m_i_new

    # write back O
    acc = tl.where(m_mask, acc / l_i[:, None], 0.0)
    tl.store(o_ptrs, acc.to(dtype), mask=m_mask)


def triton_sparse_forward(
    q,                 # [BATCH, N_HEADS, N_CTX, D_HEAD]
    k,                 # [BATCH, N_HEADS, N_CTX, D_HEAD]
    v,                 # [BATCH, N_HEADS, N_CTX, D_HEAD]
    seqlens,           # [BATCH, ]
    col_count,         # [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M)]
    col_index,         # [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M), MAX_COLS_PER_ROW]
    sm_scale,
    block_size_M=64,
    block_size_N=64,
) -> torch.Tensor:
    # shape constraints
    Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
    assert Lq == Lk and Lk == Lv
    assert Lk in {16, 32, 64, 128}
    o = torch.zeros_like(q)
    grid = (triton.cdiv(q.shape[2], block_size_M), q.shape[0] * q.shape[1], 1)
    # num_warps / num_stages are chosen by @triton.autotune on the kernel
    dtype = tl.bfloat16 if q.dtype == torch.bfloat16 else tl.float16
    triton_sparse_fwd_kernel[grid](
        q, k, v, seqlens, sm_scale,
        col_count, col_index,
        o,
        q.stride(0), q.stride(1), q.stride(2), q.stride(3),
        k.stride(0), k.stride(1), k.stride(2), k.stride(3),
        v.stride(0), v.stride(1), v.stride(2), v.stride(3),
        o.stride(0), o.stride(1), o.stride(2), o.stride(3),
        q.shape[0], q.shape[1], q.shape[2],
        col_index.shape[-2], col_index.shape[-1],
        BLOCK_M=block_size_M, BLOCK_N=block_size_N,
        BLOCK_DMODEL=Lk,
        dtype=dtype,
    )

    return o


def torch_build_index(seqlens, vertical_indexes, slash_indexes, block_size_M=64):
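    # CPU reference for pycuda_build_index: for each BLOCK_M row band, merge the
    # vertical column indices with the column ranges swept out by the slash
    # (diagonal) offsets, in ascending column order, and record the per-band
    # column count. Used only to validate the CUDA kernel.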
    # row width rounded up to a multiple of 4 to match pycuda_build_index
    max_cols_per_row = (seqlens.max().item() + 3) & (-4)
    batch_size, num_heads, NNZ_S = slash_indexes.shape
    NNZ_V = vertical_indexes.shape[-1]
    num_rows = triton.cdiv(max_cols_per_row, block_size_M)
    col_count = torch.zeros((batch_size, num_heads, num_rows), dtype=torch.int32)
    col_index = torch.zeros((batch_size, num_heads, num_rows, max_cols_per_row), dtype=torch.int32)
    for b in range(batch_size):
        seqlen = seqlens[b]
        for h in range(num_heads):
            for m, start_m in enumerate(range(0, seqlen, block_size_M)):
                end_m = start_m + block_size_M
                tmp_col_count = 0
                cursor, s, v = -1, 0, 0
                v_idx = vertical_indexes[b, h, v].item()
                while s < NNZ_S and slash_indexes[b, h, s] >= end_m:
                    s += 1
                if s < NNZ_S:
                    s_idx = end_m - slash_indexes[b, h, s].item()
                    s_range = min(s_idx, block_size_M)
                else:
                    s_idx = seqlen
                    s_range = 0
                while s_idx <= end_m and v_idx < end_m:
                    if v_idx < s_idx:
                        if v_idx < s_idx - s_range:
                            col_index[b, h, m, tmp_col_count] = v_idx
                            tmp_col_count += 1
                        v += 1
                        if v < NNZ_V:
                            v_idx = vertical_indexes[b, h, v].item()
                        else:
                            break
                    else:
                        for idx in range(max(cursor, s_idx - s_range), min(s_idx, seqlen)):
                            col_index[b, h, m, tmp_col_count] = idx
                            tmp_col_count += 1
                        cursor = s_idx
                        s += 1
                        if s < NNZ_S:
                            s_idx = end_m - slash_indexes[b, h, s].item()
                            s_range = min(s_idx, block_size_M)
                        else:
                            break
                while s_idx <= end_m and s < NNZ_S:
                    for idx in range(max(cursor, s_idx - s_range), min(s_idx, seqlen)):
                        col_index[b, h, m, tmp_col_count] = idx
                        tmp_col_count += 1
                    cursor = s_idx
                    s += 1
                    if s < NNZ_S:
                        s_idx = end_m - slash_indexes[b, h, s].item()
                        s_range = min(s_idx, block_size_M)
                    else:
                        break
                while v_idx < end_m and v < NNZ_V:
                    if v_idx < s_idx - s_range:
                        col_index[b, h, m, tmp_col_count] = v_idx
                        tmp_col_count += 1
                    v += 1
                    if v < NNZ_V:
                        v_idx = vertical_indexes[b, h, v].item()
                    else:
                        break
                col_count[b, h, m] = tmp_col_count
    return col_count.to(seqlens.device), col_index.to(seqlens.device)


PYCUDA_BUILD_INDEX_KERNEL_CODE = '''\
__device__ int min(int x, int y) {
    return x < y ? x : y;
}

__device__ int max(int x, int y) {
    return x > y ? x : y;
}

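// Write the integers [loop_start, loop_end) to output starting at `offset`
// (advanced in place), using vectorized int4 stores for the 16-byte-aligned
// middle span and scalar stores for the head and tail.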
__device__ void save_list(int* output, int loop_start, int loop_end, int& offset) {
    if (loop_start + 4 >= loop_end) {
        for (int idx = loop_start; idx < loop_end; idx++, offset++) {
            output[offset] = idx;
        }
        return;
    }
    int4 tmp_int4;
    int int4_start = ((offset + 3) & (-4)) - offset + loop_start;
    int int4_end = ((offset + loop_end - loop_start) & (-4)) - offset + loop_start;
    for (int idx = loop_start; idx < int4_start; idx++, offset++) {
        output[offset] = idx;
    }
    for (int idx = int4_start; idx < int4_end; idx += 4, offset += 4) {
        tmp_int4.x = idx + 0;
        tmp_int4.y = idx + 1;
        tmp_int4.z = idx + 2;
        tmp_int4.w = idx + 3;
        (reinterpret_cast<int4*>(&output[offset]))[0] = tmp_int4;
    }
    for (int idx = int4_end; idx < loop_end; idx++, offset++) {
        output[offset] = idx;
    }
}

__global__ void PYCUDA_BUILD_INDEX_KERNEL(
    const int* seqlens,           // [BATCH, ]
    const int* vertical_indexes,  // [BATCH, N_HEADS, NNZ_V]
    const int* slash_indexes,     // [BATCH, N_HEADS, NNZ_S]
    int* col_count,               // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M)]
    int* col_index,               // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M), N_CTX]
    int N_HEADS,
    int N_CTX,
    int BLOCK_SIZE_M,
    int N_ROWS,
    int NNZ_V,
    int NNZ_S
) {
    const int batch_idx = blockIdx.y;
    const int head_idx = blockIdx.x;
    const int group_idx = blockIdx.z;

    int seqlen = seqlens[batch_idx];
    int block_idx_m = group_idx * blockDim.x + threadIdx.x;
    int start_m = block_idx_m * BLOCK_SIZE_M;
    if (start_m >= seqlen) {
        return;
    }
    int end_m = start_m + BLOCK_SIZE_M;
    vertical_indexes += (batch_idx * N_HEADS + head_idx) * NNZ_V;
    slash_indexes += (batch_idx * N_HEADS + head_idx) * NNZ_S;
    int row_offset = (batch_idx * N_HEADS + head_idx) * N_ROWS + block_idx_m;
    col_count += row_offset;
    col_index += row_offset * N_CTX;

    int tmp_col_count = 0, cursor = -1, s = 0, v = 0;
    int v_idx = vertical_indexes[v];
    /* (unused) binary-search alternative to the linear scan below:
    int left = 0, right = NNZ_S - 1;
    int tmp_s_idx = 0, target = end_m - 1;
    s = (left + right) >> 1;
    while (left + 1 < right) {
        tmp_s_idx = slash_indexes[s];
        if (tmp_s_idx > target) {
            left = s;
        } else if (tmp_s_idx < target) {
            right = s;
        } else {
            break;
        }
        s = (left + right) >> 1;
    }
    */
    while (s < NNZ_S && slash_indexes[s] >= end_m) s++;

    int s_idx = (s < NNZ_S) ? (end_m - slash_indexes[s]) : seqlen;
    int s_range = (s < NNZ_S) ? min(s_idx, BLOCK_SIZE_M) : 0;

    while (s_idx <= end_m && v_idx < end_m) {
        if (v_idx < s_idx) {
            if (v_idx < s_idx - s_range) {
                col_index[tmp_col_count] = v_idx;
                tmp_col_count++;
            }
            v++;
            if (v < NNZ_V) {
                v_idx = vertical_indexes[v];
            } else {
                break;
            }
        } else {
            save_list(col_index, max(cursor, s_idx - s_range), min(s_idx, seqlen), tmp_col_count);
            cursor = s_idx;
            s++;
            if (s < NNZ_S) {
                s_idx = end_m - slash_indexes[s];
                s_range = min(s_idx, BLOCK_SIZE_M);
            } else {
                break;
            }
        }
    }
    while (s_idx <= end_m && s < NNZ_S) {
        save_list(col_index, max(cursor, s_idx - s_range), min(s_idx, seqlen), tmp_col_count);
        cursor = s_idx;
        s++;
        if (s < NNZ_S) {
            s_idx = end_m - slash_indexes[s];
            s_range = min(s_idx, BLOCK_SIZE_M);
        } else {
            break;
        }
    }
    while (v_idx < end_m && v < NNZ_V) {
        if (v_idx < s_idx - s_range) {
            col_index[tmp_col_count] = v_idx;
            tmp_col_count++;
        }
        v++;
        if (v < NNZ_V) {
            v_idx = vertical_indexes[v];
        } else {
            break;
        }
    }
    col_count[0] = tmp_col_count;
}
'''
PYCUDA_BUILD_INDEX_KERNEL = SourceModule(
    PYCUDA_BUILD_INDEX_KERNEL_CODE,
    options=['-std=c++14', '-O3'],
).get_function('PYCUDA_BUILD_INDEX_KERNEL')


def pycuda_build_index(seqlens, vertical_indexes, slash_indexes, block_size_M=64):
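    # GPU version of torch_build_index: one thread per BLOCK_M row band merges
    # the vertical and slash indices into the per-band column lists consumed by
    # triton_sparse_forward.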
    # row width rounded up to a multiple of 4, keeping each col_index row
    # 16-byte aligned for the int4 stores in the CUDA kernel
    max_cols_per_row = (seqlens.max().item() + 3) & (-4)
    batch_size, num_heads, NNZ_S = slash_indexes.shape
    NNZ_V = vertical_indexes.shape[-1]
    num_rows = triton.cdiv(max_cols_per_row, block_size_M)
    col_count = torch.zeros((batch_size, num_heads, num_rows), dtype=torch.int32, device=seqlens.device)
    col_index = torch.zeros((batch_size, num_heads, num_rows, max_cols_per_row), dtype=torch.int32, device=seqlens.device)
    num_threads = 64
    PYCUDA_BUILD_INDEX_KERNEL(
        seqlens, vertical_indexes, slash_indexes,
        col_count, col_index,
        np.int32(num_heads), np.int32(max_cols_per_row), np.int32(block_size_M), np.int32(num_rows),
        np.int32(NNZ_V), np.int32(NNZ_S),
        grid=(num_heads, batch_size, triton.cdiv(num_rows, num_threads)),
        block=(num_threads, 1, 1),
    )
    return col_count, col_index


def make_causal_mask(seqlens, device, context_size):
    batch_size = seqlens.shape[0]
    arange = torch.arange(context_size, dtype=torch.int32, device=device)
    causal_mask = arange[None, None, :, None] >= arange[None, None, None, :]
    causal_mask = causal_mask.repeat((batch_size, 1, 1, 1))
    for b, seqlen in enumerate(seqlens):
        causal_mask[b, :, seqlen:, :] = False
        causal_mask[b, :, :, seqlen:] = False
    return causal_mask


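# The sparsity pattern is "vertical-slash": a set of always-attended columns
# (vertical lines) plus diagonals at fixed offsets (slash lines).
# make_finegrained_mask materializes that pattern exactly, while
# make_block_mask materializes what the sparse kernel actually computes after
# rounding rows into BLOCK_M bands.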
def make_finegrained_mask(vertical_indexes, slash_indexes, causal_mask, device):
    batch_size, num_heads, _ = vertical_indexes.shape
    context_size = causal_mask.shape[-1]
    arange = torch.arange(context_size, dtype=torch.int32, device=device)
    sparse_mask = torch.zeros((batch_size, num_heads, context_size, context_size), dtype=torch.bool, device=device)
    for b in range(batch_size):
        for h in range(num_heads):
            for vertical_index in vertical_indexes[b, h]:
                sparse_mask[b, h, :, vertical_index] = True
            for slash_index in slash_indexes[b, h]:
                sparse_mask[b, h].logical_or_(arange[:, None] - arange[None, :] == slash_index)
    sparse_mask.logical_and_(causal_mask)
    return sparse_mask


def make_block_mask(col_count, col_index, seqlens, causal_mask, device, block_size_M=64):
    batch_size, num_heads, _ = col_count.shape
    context_size = causal_mask.shape[-1]
    block_mask = torch.zeros((batch_size, num_heads, context_size, context_size), dtype=torch.bool, device=device)
    for b in range(batch_size):
        for h in range(num_heads):
            for m, start_m in enumerate(range(0, seqlens[b], block_size_M)):
                end_m = start_m + block_size_M
                for c in range(col_count[b, h, m]):
                    block_mask[b, h, start_m:end_m, col_index[b, h, m, c]] = True
    block_mask.logical_and_(causal_mask)
    return block_mask


def plot_mask(mask, name, batch=0, head=0):
    import matplotlib.pyplot as plt
    import seaborn as sns
    plt.figure(figsize=(16, 12))
    plt.clf()
    mask = mask[batch, head].cpu().numpy()
    sns.heatmap(mask)
    plt.savefig(name)


@triton.jit
def triton_dense_fwd_kernel(
    Q, K, V, seqlens, sm_scale,
    Out,
    stride_qz, stride_qh, stride_qm, stride_qk,
    stride_kz, stride_kh, stride_kn, stride_kk,
    stride_vz, stride_vh, stride_vn, stride_vk,
    stride_oz, stride_oh, stride_om, stride_ok,
    Z, H, N_CTX,
    BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr,
    BLOCK_N: tl.constexpr,
    dtype: tl.constexpr,
):
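    # Dense causal FlashAttention baseline in the style of the Triton fused
    # attention tutorial: same online-softmax update as the sparse kernel, but
    # iterating over every key tile up to the diagonal.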
    start_m = tl.program_id(0)
    off_hz = tl.program_id(1)

    seqlen = tl.load(seqlens + off_hz // H)
    if start_m * BLOCK_M >= seqlen:
        return

    qo_offset = (off_hz // H) * stride_qz + (off_hz % H) * stride_qh
    kv_offset = (off_hz // H) * stride_kz + (off_hz % H) * stride_kh
    Q_block_ptr = tl.make_block_ptr(
        base=Q + qo_offset,
        shape=(N_CTX, BLOCK_DMODEL),
        strides=(stride_qm, stride_qk),
        offsets=(start_m * BLOCK_M, 0),
        block_shape=(BLOCK_M, BLOCK_DMODEL),
        order=(1, 0)
    )
    K_block_ptr = tl.make_block_ptr(
        base=K + kv_offset,
        shape=(BLOCK_DMODEL, N_CTX),
        strides=(stride_kk, stride_kn),
        offsets=(0, 0),
        block_shape=(BLOCK_DMODEL, BLOCK_N),
        order=(0, 1)
    )
    V_block_ptr = tl.make_block_ptr(
        base=V + kv_offset,
        shape=(N_CTX, BLOCK_DMODEL),
        strides=(stride_vn, stride_vk),
        offsets=(0, 0),
        block_shape=(BLOCK_N, BLOCK_DMODEL),
        order=(1, 0)
    )
    # initialize offsets
    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
    offs_n = tl.arange(0, BLOCK_N)
    # initialize pointer to m and l
    m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
    l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
    acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
    # scale sm_scale by log_2(e) and use
    # 2^x instead of exp in the loop because CSE and LICM
    # don't work as expected with `exp` in the loop
    qk_scale = sm_scale * 1.44269504
    # load q: it will stay in SRAM throughout
    q = tl.load(Q_block_ptr)
    q = (q * qk_scale).to(dtype)
    # loop over k, v and update accumulator
    lo = 0
    hi = (start_m + 1) * BLOCK_M
    m_mask = offs_m[:, None] < seqlen

    for start_n in range(lo, hi, BLOCK_N):
        n_mask = (start_n + offs_n[None, :]) <= offs_m[:, None]
        # -- load k, v --
        k = tl.load(K_block_ptr)
        v = tl.load(V_block_ptr)
        # -- compute qk --
        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
        qk = tl.where(m_mask & n_mask, qk, float("-inf"))
        qk += tl.dot(q, k)
        # -- compute scaling constant --
        m_i_new = tl.maximum(m_i, tl.max(qk, 1))
        alpha = tl.math.exp2(m_i - m_i_new)
        p = tl.math.exp2(qk - m_i_new[:, None])
        # -- scale and update acc --
        acc_scale = l_i * 0 + alpha  # workaround for a Triton compiler bug
        acc *= acc_scale[:, None]
        acc += tl.dot(p.to(dtype), v)
        # -- update m_i and l_i --
        l_i = l_i * alpha + tl.sum(p, 1)
        m_i = m_i_new
        # update pointers
        K_block_ptr = tl.advance(K_block_ptr, (0, BLOCK_N))
        V_block_ptr = tl.advance(V_block_ptr, (BLOCK_N, 0))
    # write back O
    acc = tl.where(m_mask, acc / l_i[:, None], 0.0)
    O_block_ptr = tl.make_block_ptr(
        base=Out + qo_offset,
        shape=(N_CTX, BLOCK_DMODEL),
        strides=(stride_om, stride_ok),
        offsets=(start_m * BLOCK_M, 0),
        block_shape=(BLOCK_M, BLOCK_DMODEL),
        order=(1, 0)
    )
    # block pointers do not accept `mask`; bound rows via boundary_check instead
    # (rows past seqlen were zeroed above, so storing them is harmless)
    tl.store(O_block_ptr, acc.to(dtype), boundary_check=(0,))


def triton_dense_forward(q, k, v, seqlens, sm_scale, block_size_M=128, block_size_N=64) -> torch.Tensor:
    # shape constraints
    Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
    assert Lq == Lk and Lk == Lv
    assert Lk in {16, 32, 64, 128}
    o = torch.zeros_like(q)
    grid = (triton.cdiv(q.shape[2], block_size_M), q.shape[0] * q.shape[1], 1)
    num_warps = 4 if Lk <= 64 else 8
    dtype = tl.bfloat16 if q.dtype == torch.bfloat16 else tl.float16
    triton_dense_fwd_kernel[grid](
        q, k, v, seqlens, sm_scale,
        o,
        q.stride(0), q.stride(1), q.stride(2), q.stride(3),
        k.stride(0), k.stride(1), k.stride(2), k.stride(3),
        v.stride(0), v.stride(1), v.stride(2), v.stride(3),
        o.stride(0), o.stride(1), o.stride(2), o.stride(3),
        q.shape[0], q.shape[1], q.shape[2],
        BLOCK_M=block_size_M, BLOCK_N=block_size_N,
        BLOCK_DMODEL=Lk,
        dtype=dtype,
        num_warps=num_warps, num_stages=4,
    )

    return o


def flash_attn_forward(q, k, v, seqlens, sm_scale, context_size) -> torch.Tensor:
    return flash_attn_varlen_func(
        q,
        k,
        v,
        cu_seqlens_q=seqlens,
        cu_seqlens_k=seqlens,
        max_seqlen_q=context_size,
        max_seqlen_k=context_size,
        dropout_p=0.0,
        softmax_scale=sm_scale,
        causal=True,
    )


def torch_forward(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    mask: torch.Tensor,
    sm_scale: float,
) -> torch.Tensor:
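    # Plain PyTorch reference: masked softmax(Q K^T * sm_scale) @ V.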
    p = torch.einsum('bhmk, bhnk -> bhmn', query, key) * sm_scale
    p = p.where(mask, -torch.inf)
    p_max = p.max(-1, keepdim=True).values
    # clamp the row max at 0 so fully-masked rows (max == -inf) cannot yield NaN
    p_max = torch.where(p_max < 0, 0.0, p_max)
    p_exp = torch.exp(p - p_max)
    s = p_exp / (p_exp.sum(-1, keepdim=True) + 1e-6)
    out = torch.einsum('bhmn, bhnk -> bhmk', s, value)
    return out


def profile(fn, total_flops, tag, warmup=25, rep=100):
    ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
    gflops = total_flops / ms * 1e-6  # total_flops / (ms * 1e-3) / 1e9
    print(f'{tag}: {ms:.3f} ms | {gflops:.3f} GFLOP/s')


def test_flash_attention(
    seqlens=None,
    vertical_indexes=None,
    slash_indexes=None,
    dtype=torch.float16,
    device="cuda",
    torch_test=True,
    batch_size=4,
    num_heads=32,
    context_size=1024,
    head_dim=128,
    sparsity=0.995,
    block_size_M=64,
    block_size_N=64,
):
    print('========================================')
    print(f'BATCH={batch_size}, N_CTX={context_size}, N_HEADS={num_heads}, D_HEAD={head_dim}')
    q = torch.randn((batch_size, num_heads, context_size, head_dim), dtype=dtype, device=device)
    k = torch.randn((batch_size, num_heads, context_size, head_dim), dtype=dtype, device=device)
    v = torch.randn((batch_size, num_heads, context_size, head_dim), dtype=dtype, device=device)
    if seqlens is None:
        seqlens = torch.randint(context_size // 2, context_size, (batch_size, ), dtype=torch.int32, device=device)
    else:
        seqlens = torch.tensor(seqlens, dtype=torch.int32, device=device)
    dense_mask_nnz = seqlens.to(torch.float32).square().sum().item() * num_heads / 2
    sm_scale = head_dim ** -0.5

    causal_mask = make_causal_mask(seqlens, device, context_size)
    if torch_test:
        ref_o_dense = torch_forward(q, k, v, causal_mask, sm_scale)

    if vertical_indexes is None or slash_indexes is None:
        nnz = int((1 - sparsity) * context_size)
        vertical_indexes = torch.stack([
            torch.stack([
                torch.randperm(seqlen, dtype=torch.int32, device=device)[:nnz].sort(descending=False)[0]
                for _ in range(num_heads)
            ])
            for seqlen in seqlens
        ])
        slash_indexes = torch.concatenate([
            torch.stack([
                torch.stack([
                    torch.randperm(seqlen - 1, dtype=torch.int32, device=device)[:nnz].sort(descending=True)[0] + 1
                    for _ in range(num_heads)
                ])
                for seqlen in seqlens
            ]),
            torch.zeros((batch_size, num_heads, 1), dtype=torch.int32, device=device)
        ], dim=-1)
    col_count, col_index = pycuda_build_index(seqlens, vertical_indexes, slash_indexes, block_size_M)
    if torch_test:
        col_count_ref, col_index_ref = torch_build_index(seqlens, vertical_indexes, slash_indexes, block_size_M)
        torch.testing.assert_close(col_count_ref, col_count)
        torch.testing.assert_close(col_index_ref, col_index)
    sparse_mask_nnz = col_count.to(torch.float32).sum().item() * block_size_M
    print(f'block mask sparsity: {1 - sparse_mask_nnz / dense_mask_nnz}')
    pycuda_build_index_fn = lambda: pycuda_build_index(seqlens, vertical_indexes, slash_indexes, block_size_M)
    profile(pycuda_build_index_fn, 0., 'pycuda-index')

    if torch_test:
        finegrained_mask = make_finegrained_mask(vertical_indexes, slash_indexes, causal_mask, device)
        block_mask = make_block_mask(col_count, col_index, seqlens, causal_mask, device, block_size_M)
        # plot_mask(finegrained_mask, 'mask.png', 2, 26)
        # plot_mask(block_mask, 'mask-1.png', 2, 26)
        ref_o_sparse = torch_forward(q, k, v, block_mask, sm_scale)

    triton_dense_fn = lambda: triton_dense_forward(q, k, v, seqlens, sm_scale)
    output = triton_dense_fn()
    if torch_test:
        torch.testing.assert_close(output, ref_o_dense, atol=1e-2, rtol=0)
    profile(triton_dense_fn, 2. * head_dim * dense_mask_nnz, 'triton-dense')

    triton_sparse_fn = lambda: triton_sparse_forward(q, k, v, seqlens, col_count, col_index, sm_scale, block_size_M, block_size_N)
    output = triton_sparse_fn()
    if torch_test:
        torch.testing.assert_close(output, ref_o_sparse, atol=1e-2, rtol=0)
    profile(triton_sparse_fn, 2. * head_dim * sparse_mask_nnz, 'triton-sparse')

    q = q.swapaxes(1, 2).contiguous()
    k = k.swapaxes(1, 2).contiguous()
    v = v.swapaxes(1, 2).contiguous()
    q = torch.concatenate([q[i, :seqlen, :, :] for i, seqlen in enumerate(seqlens)])
    k = torch.concatenate([k[i, :seqlen, :, :] for i, seqlen in enumerate(seqlens)])
    v = torch.concatenate([v[i, :seqlen, :, :] for i, seqlen in enumerate(seqlens)])
    seqlens = torch.nn.functional.pad(torch.cumsum(seqlens, dim=0, dtype=torch.int32), (1, 0))

    flash_fn = lambda: flash_attn_forward(q, k, v, seqlens, sm_scale, context_size)
    output = flash_fn()
    output = torch.stack([
        torch.nn.functional.pad(
            output[seqlens[i]:seqlens[i + 1], :, :],
            (0, 0, 0, 0, 0, context_size + seqlens[i] - seqlens[i + 1])
        )
        for i in range(batch_size)
    ]).swapaxes(1, 2).contiguous()
    if torch_test:
        torch.testing.assert_close(output, ref_o_dense, atol=1e-2, rtol=0)
    profile(flash_fn, 2. * head_dim * dense_mask_nnz, 'flash-dense')
    print('========================================\n')


def pit_sparse_flash_attention_forward(
    query: torch.Tensor,  # [BATCH, N_HEADS, N_CTX, D_HEAD]
    key: torch.Tensor,    # [BATCH, N_HEADS, N_CTX, D_HEAD]
    value: torch.Tensor,  # [BATCH, N_HEADS, N_CTX, D_HEAD]
    v_idx: torch.Tensor,  # [BATCH, N_HEADS, NNZ_V]
    s_idx: torch.Tensor,  # [BATCH, N_HEADS, NNZ_S]
    block_size_M: int = 64,
    block_size_N: int = 64,
):
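    # Public entry point: pad the sequence length to a multiple of block_size_M,
    # build the sparse column index from the vertical/slash indices, run the
    # sparse Triton kernel, and strip the padding from the output.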
    q_len = query.shape[2]
    # pad the sequence length up to a multiple of block_size_M (zero when already aligned)
    pad = (block_size_M - q_len % block_size_M) % block_size_M
    query = torch.nn.functional.pad(query, [0, 0, 0, pad, 0, 0, 0, 0])
    key = torch.nn.functional.pad(key, [0, 0, 0, pad, 0, 0, 0, 0])
    value = torch.nn.functional.pad(value, [0, 0, 0, pad, 0, 0, 0, 0])
    batch_size, num_heads, context_size, head_dim = query.shape
    v_idx = v_idx.to(torch.int32).reshape((batch_size, num_heads, -1)).sort(dim=-1, descending=False)[0]
    s_idx = s_idx.to(torch.int32).reshape((batch_size, num_heads, -1)).sort(dim=-1, descending=True)[0]
    # one sequence-length entry per batch element (the kernels index seqlens by batch)
    seqlens = torch.full((batch_size,), context_size, dtype=torch.int32, device=query.device)
    sm_scale = head_dim ** -0.5
    col_count, col_index = pycuda_build_index(seqlens, v_idx, s_idx, block_size_M)
    out = triton_sparse_forward(query, key, value, seqlens, col_count, col_index, sm_scale, block_size_M, block_size_N)[...,:q_len,:]
    return out
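

# Minimal self-test sketch (assumes a CUDA device and the dependencies imported
# above; the arguments simply restate test_flash_attention's defaults):
if __name__ == '__main__':
    test_flash_attention(batch_size=4, num_heads=32, context_size=1024, head_dim=128)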