Spaces:

microsoft
/

MInference

Running on Zero

App Files Files Community

MInference / minference /ops /pit_sparse_flash_attention_v2.py

iofu728

Feature(MInference): fix the func name

b215053 about 1 month ago

raw

history blame

No virus

30.3 kB

	import torch
	import numpy as np
	import math

	import triton
	import triton.language as tl
	import pycuda.autoprimaryctx
	from pycuda.compiler import SourceModule

	from flash_attn import flash_attn_varlen_func


	# @triton.autotune(
	# configs=[
	# triton.Config({}, num_stages=1, num_warps=4),
	# triton.Config({}, num_stages=1, num_warps=8),
	# triton.Config({}, num_stages=2, num_warps=4),
	# triton.Config({}, num_stages=2, num_warps=8),
	# triton.Config({}, num_stages=3, num_warps=4),
	# triton.Config({}, num_stages=3, num_warps=8),
	# triton.Config({}, num_stages=4, num_warps=4),
	# triton.Config({}, num_stages=4, num_warps=8),
	# triton.Config({}, num_stages=5, num_warps=4),
	# triton.Config({}, num_stages=5, num_warps=8),
	# ],
	# key=['N_CTX'],
	# )
	# Forward kernel for causal attention restricted to a per-tile sparse index:
	# a list of dense BLOCK_N-wide key blocks (block_offset) plus a list of single
	# key columns (column_index).
	# Launch layout: program_id(0) selects the BLOCK_M-row query tile and
	# program_id(1) the flattened (batch, head) pair, with batch = off_hz // H and
	# head = off_hz % H.
	# The softmax is evaluated online in base 2: q is pre-scaled by
	# sm_scale * log2(e), and the running row maximum (m_i), row sum (l_i) and
	# output accumulator (acc) are rescaled each time a new chunk of keys is seen.
	@triton.jit
	def triton_sparse_fwd_kernel(
	Q, K, V, seqlens, sm_scale,
	block_count, block_offset, column_count, column_index,
	Out,
	stride_qz, stride_qh, stride_qm, stride_qk,
	stride_kz, stride_kh, stride_kn, stride_kk,
	stride_vz, stride_vh, stride_vn, stride_vk,
	stride_oz, stride_oh, stride_om, stride_ok,
	Z, H, N_CTX,
	NUM_ROWS, NNZ_S, NNZ_V,
	BLOCK_M: tl.constexpr,
	BLOCK_N: tl.constexpr,
	BLOCK_DMODEL: tl.constexpr,
	dtype: tl.constexpr,
	):
	start_m = tl.program_id(0)
	off_hz = tl.program_id(1)

	# seqlens is indexed by batch; a tile that starts at or beyond the sequence
	# end has nothing to compute.
	seqlen = tl.load(seqlens + off_hz // H)
	if start_m * BLOCK_M >= seqlen:
	return

	# initialize offsets
	offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
	offs_n = tl.arange(0, BLOCK_N)
	offs_d = tl.arange(0, BLOCK_DMODEL)

	# Base offsets of this (batch, head) slice. The Q strides are reused for Out
	# and the K strides for V when computing these bases.
	qo_offset = (off_hz // H) * stride_qz + (off_hz % H) * stride_qh
	kv_offset = (off_hz // H) * stride_kz + (off_hz % H) * stride_kh

	# k_ptrs is laid out [BLOCK_DMODEL, 1] so that loaded K tiles are already
	# transposed for tl.dot(q, k); the key position is added per loop iteration.
	q_ptrs = Q + qo_offset + offs_m[:, None] * stride_qm + offs_d[None, :] * stride_qk
	k_ptrs = K + kv_offset + offs_d[:, None] * stride_kk
	v_ptrs = V + kv_offset + offs_d[None, :] * stride_vk
	o_ptrs = Out + qo_offset + offs_m[:, None] * stride_om + offs_d[None, :] * stride_ok

	# Sparse index of this tile: num_blks block start offsets (row stride NNZ_S)
	# and num_cols individual key columns (row stride NNZ_V).
	num_blks = tl.load(block_count + off_hz * NUM_ROWS + start_m)
	blks_ptr = block_offset + (off_hz * NUM_ROWS + start_m) * NNZ_S
	num_cols = tl.load(column_count + off_hz * NUM_ROWS + start_m)
	cols_ptr = column_index + (off_hz * NUM_ROWS + start_m) * NNZ_V

	# initialize pointer to m and l
	m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
	l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
	acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
	# scale sm_scale by log_2(e) and use
	# 2^x instead of exp in the loop because CSE and LICM
	# don't work as expected with `exp` in the loop
	qk_scale = sm_scale * 1.44269504
	# load q: it will stay in SRAM throughout
	# NOTE(review): this load is unmasked, so Q presumably has to provide a full
	# BLOCK_M rows for every launched tile (e.g. via padding) - confirm with callers.
	q = tl.load(q_ptrs)
	q = (q * qk_scale).to(dtype)

	# loop over k, v and update accumulator
	# m_mask marks the query rows of this tile that lie inside the sequence.
	m_mask = offs_m[:, None] < seqlen

	# Pass 1: dense key blocks. Each block covers keys [start_n, start_n + BLOCK_N);
	# keys past seqlen are masked on load and the causal constraint is applied to the scores.
	for block_index in range(num_blks):
	start_n = tl.load(blks_ptr + block_index)
	cols = start_n + offs_n
	n_mask = cols < seqlen
	# -- load k, v --
	k = tl.load(k_ptrs + cols[None, :] * stride_kn, mask=n_mask[None, :], other=0.0)
	v = tl.load(v_ptrs + cols[:, None] * stride_vn, mask=n_mask[:, None], other=0.0)
	# -- compute qk --
	# Masked positions start at -inf so they contribute exp2(-inf) = 0 below.
	qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
	causal_mask = cols[None, :] <= offs_m[:, None]
	qk = tl.where(m_mask & causal_mask, qk, float("-inf"))
	qk += tl.dot(q, k)
	# -- compute scaling constant --
	m_i_new = tl.maximum(m_i, tl.max(qk, 1))
	alpha = tl.math.exp2(m_i - m_i_new)
	p = tl.math.exp2(qk - m_i_new[:, None])
	# -- scale and update acc --
	acc_scale = l_i * 0 + alpha # workaround some compiler bug
	acc *= acc_scale[:, None]
	acc += tl.dot(p.to(dtype), v)
	# -- update m_i and l_i --
	l_i = l_i * alpha + tl.sum(p, 1)
	m_i = m_i_new

	# Pass 2: individual key columns, gathered BLOCK_N at a time from column_index.
	# Only the tail of the column list is masked (n_mask); no causal mask is applied here.
	# NOTE(review): this presumably relies on the index builder emitting only
	# columns that every row of the tile may attend to - confirm.
	for start_n in range(0, num_cols, BLOCK_N):
	n_mask = start_n + offs_n < num_cols
	cols = tl.load(cols_ptr + start_n + offs_n, mask=n_mask, other=0)
	# -- load k, v --
	k = tl.load(k_ptrs + cols[None, :] * stride_kn, mask=n_mask[None, :], other=0.0)
	v = tl.load(v_ptrs + cols[:, None] * stride_vn, mask=n_mask[:, None], other=0.0)
	# -- compute qk --
	qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
	qk = tl.where(m_mask & n_mask, qk, float("-inf"))
	qk += tl.dot(q, k)
	# -- compute scaling constant --
	m_i_new = tl.maximum(m_i, tl.max(qk, 1))
	alpha = tl.math.exp2(m_i - m_i_new)
	p = tl.math.exp2(qk - m_i_new[:, None])
	# -- scale and update acc --
	acc_scale = l_i * 0 + alpha # workaround some compiler bug
	acc *= acc_scale[:, None]
	acc += tl.dot(p.to(dtype), v)
	# -- update m_i and l_i --
	l_i = l_i * alpha + tl.sum(p, 1)
	m_i = m_i_new

	# write back O
	# Normalise by the accumulated row sums; rows outside the sequence are not
	# written because the store is masked with m_mask.
	acc /= l_i[:, None]
	# acc = tl.where(m_mask, acc / l_i[:, None], 0.0)
	tl.store(o_ptrs, acc.to(dtype), mask=m_mask)


	def triton_sparse_forward(
	    q: torch.Tensor,  # [BATCH, N_HEADS, N_CTX, D_HEAD]
	    k: torch.Tensor,  # [BATCH, N_HEADS, N_CTX, D_HEAD]
	    v: torch.Tensor,  # [BATCH, N_HEADS, N_CTX, D_HEAD]
	    seqlens: torch.Tensor,  # [BATCH, ]
	    block_count: torch.Tensor,  # [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M)]
	    block_offset: torch.Tensor,  # [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M), NNZ_S]
	    column_count: torch.Tensor,  # [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M)]
	    column_index: torch.Tensor,  # [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M), NNZ_V]
	    sm_scale: float,
	    block_size_M: int = 64,
	    block_size_N: int = 64,
	) -> torch.Tensor:
	    """Launch ``triton_sparse_fwd_kernel`` and return the attention output.

	    The output is a zero-initialised tensor shaped like ``q``. One program is
	    launched per (query tile, batch * head) pair. The kernel compute dtype is
	    bfloat16 when ``q`` is bfloat16 and float16 for every other input dtype.

	    Raises:
	        AssertionError: if the head dims of ``q``, ``k`` and ``v`` differ or
	            the head dim is not one of 16, 32, 64, 128.
	    """
	    # shape constraints
	    head_dim_q = q.shape[-1]
	    head_dim_k = k.shape[-1]
	    head_dim_v = v.shape[-1]
	    assert head_dim_q == head_dim_k and head_dim_k == head_dim_v
	    assert head_dim_k in {16, 32, 64, 128}

	    batch, heads, n_ctx = q.shape[0], q.shape[1], q.shape[2]
	    out = torch.zeros_like(q)

	    if q.dtype == torch.bfloat16:
	        kernel_dtype = tl.bfloat16
	    else:
	        kernel_dtype = tl.float16

	    launch_grid = (triton.cdiv(n_ctx, block_size_M), batch * heads, 1)
	    triton_sparse_fwd_kernel[launch_grid](
	        q, k, v, seqlens, sm_scale,
	        block_count, block_offset, column_count, column_index,
	        out,
	        q.stride(0), q.stride(1), q.stride(2), q.stride(3),
	        k.stride(0), k.stride(1), k.stride(2), k.stride(3),
	        v.stride(0), v.stride(1), v.stride(2), v.stride(3),
	        out.stride(0), out.stride(1), out.stride(2), out.stride(3),
	        batch, heads, n_ctx,
	        block_count.shape[-1], block_offset.shape[-1], column_index.shape[-1],
	        BLOCK_M=block_size_M, BLOCK_N=block_size_N,
	        BLOCK_DMODEL=head_dim_k,
	        dtype=kernel_dtype,
	        num_warps=4, num_stages=2,
	    )

	    return out


	def torch_build_index(seqlens, vertical_indexes, slash_indexes, context_size, block_size_M=64, block_size_N=64):
	    """Build the block/column sparse index on the host (reference implementation).

	    For every (batch, head) and every ``block_size_M``-row query tile that
	    starts inside the sequence, each slash index ``d`` is mapped to the key
	    range ``[s_idx - block_size_M, s_idx)`` with
	    ``s_idx = max(end_m - d, block_size_M)``. Ranges that start at or before
	    the end of the current range are merged by growing it by ``block_size_M``;
	    otherwise the current range is flushed as ``block_size_N``-strided block
	    start offsets. Vertical indexes below the end of the last range that are
	    not covered by any emitted block are kept as individual columns.

	    Args:
	        seqlens: integer tensor with one sequence length per batch entry.
	        vertical_indexes: integer tensor ``[batch, heads, NNZ_V]``.
	        slash_indexes: integer tensor ``[batch, heads, NNZ_S]``. The first
	            entries that are ``>= end_m`` are skipped, so every head needs
	            at least one entry below ``end_m`` of its first tile, otherwise
	            an ``IndexError`` is raised.
	        context_size: padded sequence length used to size the outputs.
	        block_size_M: number of query rows per tile.
	        block_size_N: stride used when splitting a key range into blocks.

	    Returns:
	        ``(block_count, block_offset, column_count, column_index)`` as int32
	        tensors on ``seqlens.device``; unused slots stay zero.
	    """
	    device = seqlens.device
	    batch_size, num_heads, NNZ_S = slash_indexes.shape
	    NNZ_V = vertical_indexes.shape[-1]
	    # Ceiling division; same value triton.cdiv would return.
	    num_rows = (context_size + block_size_M - 1) // block_size_M
	    block_count = torch.zeros((batch_size, num_heads, num_rows), dtype=torch.int32)
	    block_offset = torch.zeros((batch_size, num_heads, num_rows, NNZ_S), dtype=torch.int32)
	    column_count = torch.zeros((batch_size, num_heads, num_rows), dtype=torch.int32)
	    column_index = torch.zeros((batch_size, num_heads, num_rows, NNZ_V), dtype=torch.int32)

	    # Copy the inputs to host lists once instead of per tile / per element;
	    # this avoids a device round trip for every scalar read in the loops below.
	    seqlen_list = seqlens.tolist()
	    vertical_lists = vertical_indexes.tolist()
	    slash_lists = slash_indexes.tolist()

	    for b in range(batch_size):
	        seqlen = seqlen_list[b]
	        for h in range(num_heads):
	            slashes = slash_lists[b][h]
	            verticals = vertical_lists[b][h]
	            for m, start_m in enumerate(range(0, seqlen, block_size_M)):
	                end_m = start_m + block_size_M

	                # Skip leading slash indexes that do not reach this tile.
	                s = 0
	                while slashes[s] >= end_m:
	                    s += 1
	                s_idx = max(end_m - slashes[s], block_size_M)
	                s += 1
	                range_start = s_idx - block_size_M
	                range_end = s_idx

	                tmp_blocks = []
	                while s < NNZ_S:
	                    s_idx = max(end_m - slashes[s], block_size_M)
	                    if s_idx > range_end + block_size_M:
	                        # Disjoint from the current range: flush and restart.
	                        tmp_blocks += list(range(range_start, range_end, block_size_N))
	                        range_start = s_idx - block_size_M
	                        range_end = s_idx
	                    elif s_idx > range_end:
	                        # Overlapping or adjacent: extend the current range.
	                        range_end += block_size_M
	                    s += 1
	                tmp_blocks += list(range(range_start, range_end, block_size_N))

	                block_count[b, h, m] = len(tmp_blocks)
	                block_offset[b, h, m, :len(tmp_blocks)] = torch.tensor(tmp_blocks, dtype=block_offset.dtype)

	                # Keep vertical columns below the end of the last range that
	                # no emitted block already covers.
	                tmp_columns = [col for col in verticals if col < range_end]
	                for blk_start in tmp_blocks:
	                    blk_end = blk_start + block_size_N
	                    tmp_columns = [col for col in tmp_columns if col < blk_start or col >= blk_end]
	                column_count[b, h, m] = len(tmp_columns)
	                column_index[b, h, m, :len(tmp_columns)] = torch.tensor(tmp_columns, dtype=block_offset.dtype)

	    return block_count.to(device), block_offset.to(device), column_count.to(device), column_index.to(device)


	# CUDA source of the index-building kernel (compiled below with PyCUDA).
	# One thread handles one BLOCK_SIZE_M-row query tile of one (batch, head) pair:
	# blockIdx.x selects the head, blockIdx.y the batch, and
	# blockIdx.z * blockDim.x + threadIdx.x the tile; tiles starting at or past
	# seqlens[batch] return immediately.
	# Per tile the kernel walks the slash indexes in storage order. Each slash
	# index d becomes the key range [s_idx - BLOCK_SIZE_M, s_idx) with
	# s_idx = max(end_m - d, BLOCK_SIZE_M). A following range that starts at or
	# before the current range end grows the current range by BLOCK_SIZE_M;
	# otherwise the current range is flushed through save_blocks() as
	# BLOCK_SIZE_N-strided start offsets into block_offset.
	# Vertical indexes are consumed in the same loop: one that lies below the
	# current range start is written to column_index, one that lies inside the
	# current range is dropped. Once the vertical list is exhausted the sentinel
	# end_m + BLOCK_SIZE_M is used in its place.
	# The per-tile totals are written to block_count / column_count at the end.
	# NOTE(review): the `while (s_idx >= end_m)` skip loop has no bound on `s`;
	# it appears to rely on every head having a slash index below end_m (for
	# example 0) - confirm with callers.
	# NOTE(review): the merge logic looks like it expects slash indexes sorted in
	# descending and vertical indexes in ascending order - confirm with callers.
	PYCUDA_BUILD_INDEX_KERNEL_CODE = '''\
	__device__ int min(int x, int y) {
	return x < y ? x : y;
	}

	__device__ int max(int x, int y) {
	return x > y ? x : y;
	}

	__device__ void save_blocks(int* block_offset, int range_start, int range_end, int block_size, int& block_count) {
	for (int idx = range_start; idx < range_end; idx += block_size) {
	block_offset[block_count++] = idx;
	}
	}

	__global__ void PYCUDA_BUILD_INDEX_KERNEL(
	const int* seqlens, // [BATCH, ]
	const int* vertical_indexes, // [BATCH, N_HEADS, NNZ_V]
	const int* slash_indexes, // [BATCH, N_HEADS, NNZ_S]
	int* block_count, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M)]
	int* block_offset, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M), NNZ_S]
	int* column_count, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M)]
	int* column_index, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M), NNZ_V]
	int N_HEADS,
	int N_ROWS,
	int BLOCK_SIZE_M,
	int BLOCK_SIZE_N,
	int NNZ_V,
	int NNZ_S
	) {
	const int batch_idx = blockIdx.y;
	const int head_idx = blockIdx.x;
	const int group_idx = blockIdx.z;

	int seqlen = seqlens[batch_idx];
	int block_idx_m = group_idx * blockDim.x + threadIdx.x;
	int start_m = block_idx_m * BLOCK_SIZE_M;
	if (start_m >= seqlen) {
	return;
	}
	int end_m = start_m + BLOCK_SIZE_M;
	vertical_indexes += (batch_idx * N_HEADS + head_idx) * NNZ_V;
	slash_indexes += (batch_idx * N_HEADS + head_idx) * NNZ_S;
	int row_offset = (batch_idx * N_HEADS + head_idx) * N_ROWS + block_idx_m;
	block_count += row_offset;
	block_offset += row_offset * NNZ_S;
	column_count += row_offset;
	column_index += row_offset * NNZ_V;

	int tmp_col_cnt = 0, tmp_blk_cnt = 0;
	int s = 0, v = 0;
	int v_idx = vertical_indexes[v++];
	int s_idx = slash_indexes[s++];
	while (s_idx >= end_m) {
	s_idx = slash_indexes[s++];
	}
	s_idx = max(end_m - s_idx, BLOCK_SIZE_M);
	int range_start = s_idx - BLOCK_SIZE_M, range_end = s_idx;
	while (1) {
	if (v_idx < range_end) {
	if (v_idx < range_start) {
	column_index[tmp_col_cnt++] = v_idx;
	}
	if (v < NNZ_V) {
	v_idx = vertical_indexes[v++];
	} else {
	v_idx = end_m + BLOCK_SIZE_M;
	}
	} else {
	if (s < NNZ_S) {
	s_idx = max(end_m - slash_indexes[s++], BLOCK_SIZE_M);
	} else {
	save_blocks(block_offset, range_start, range_end, BLOCK_SIZE_N, tmp_blk_cnt);
	break;
	}
	if (s_idx > range_end + BLOCK_SIZE_M) {
	save_blocks(block_offset, range_start, range_end, BLOCK_SIZE_N, tmp_blk_cnt);
	range_start = s_idx - BLOCK_SIZE_M;
	range_end = s_idx;
	} else if (s_idx > range_end) {
	range_end += BLOCK_SIZE_M;
	}
	}
	}

	block_count[0] = tmp_blk_cnt;
	column_count[0] = tmp_col_cnt;
	}
	'''
	# Compile the CUDA source when this module is imported and keep a handle to
	# the kernel entry point; this happens as a module-level side effect.
	PYCUDA_BUILD_INDEX_KERNEL = SourceModule(
	PYCUDA_BUILD_INDEX_KERNEL_CODE,
	options=['-std=c++14', '-O3'],
	).get_function(f'PYCUDA_BUILD_INDEX_KERNEL')


	def pycuda_build_index(seqlens, vertical_indexes, slash_indexes, context_size, block_size_M=64, block_size_N=64):
	    """Build the block/column sparse index on the GPU with the PyCUDA kernel.

	    Allocates four zero-filled int32 outputs on ``seqlens.device`` and launches
	    ``PYCUDA_BUILD_INDEX_KERNEL`` with 64 threads per block on a grid of
	    ``(num_heads, batch_size, cdiv(num_rows, 64))``.

	    Returns:
	        ``(block_count, block_offset, column_count, column_index)`` shaped
	        ``[batch, heads, rows]``, ``[batch, heads, rows, NNZ_S]``,
	        ``[batch, heads, rows]`` and ``[batch, heads, rows, NNZ_V]``, where
	        ``rows = cdiv(context_size, block_size_M)``.
	    """
	    batch_size, num_heads, NNZ_S = slash_indexes.shape
	    NNZ_V = vertical_indexes.shape[-1]
	    num_rows = triton.cdiv(context_size, block_size_M)
	    target_device = seqlens.device

	    def _int32_zeros(*shape):
	        # All outputs share dtype and device; only the shape differs.
	        return torch.zeros(shape, dtype=torch.int32, device=target_device)

	    block_count = _int32_zeros(batch_size, num_heads, num_rows)
	    block_offset = _int32_zeros(batch_size, num_heads, num_rows, NNZ_S)
	    column_count = _int32_zeros(batch_size, num_heads, num_rows)
	    column_index = _int32_zeros(batch_size, num_heads, num_rows, NNZ_V)

	    threads_per_block = 64
	    # grid.x -> head, grid.y -> batch, grid.z -> group of query tiles.
	    launch_grid = (num_heads, batch_size, triton.cdiv(num_rows, threads_per_block))
	    # NOTE(review): the torch tensors are handed to PyCUDA directly; presumably
	    # they are expected to be contiguous int32 CUDA tensors - confirm.
	    PYCUDA_BUILD_INDEX_KERNEL(
	        seqlens, vertical_indexes, slash_indexes,
	        block_count, block_offset, column_count, column_index,
	        np.int32(num_heads), np.int32(num_rows),
	        np.int32(block_size_M), np.int32(block_size_N),
	        np.int32(NNZ_V), np.int32(NNZ_S),
	        grid=launch_grid,
	        block=(threads_per_block, 1, 1),
	    )
	    return block_count, block_offset, column_count, column_index


	def make_causal_mask(seqlens, device, context_size):
	    """Return a boolean ``[batch, 1, context_size, context_size]`` causal mask.

	    Entry ``[b, 0, i, j]`` is True when ``i >= j`` and both ``i`` and ``j``
	    are smaller than ``seqlens[b]``.
	    """
	    positions = torch.arange(context_size, dtype=torch.int32, device=device)
	    query_pos = positions[None, None, :, None]
	    key_pos = positions[None, None, None, :]
	    lower_triangle = query_pos >= key_pos

	    # One independent copy per batch entry so each can be trimmed separately.
	    mask = lower_triangle.repeat((seqlens.shape[0], 1, 1, 1))
	    for batch_idx, length in enumerate(seqlens):
	        # Clear every row and every column that lies beyond this sequence.
	        mask[batch_idx, :, length:, :] = False
	        mask[batch_idx, :, :, length:] = False
	    return mask


	def make_finegrained_mask(vertical_indexes, slash_indexes, causal_mask, device):
	    """Return the element-level vertical + slash mask, limited to ``causal_mask``.

	    For every (batch, head) a position ``(i, j)`` is enabled when ``j`` is one
	    of the head's vertical indexes or ``i - j`` equals one of its slash
	    indexes. The result is then AND-ed with ``causal_mask``. The context size
	    is taken from the last dimension of ``causal_mask``.
	    """
	    batch_size, num_heads, _ = vertical_indexes.shape
	    context_size = causal_mask.shape[-1]
	    positions = torch.arange(context_size, dtype=torch.int32, device=device)
	    # diagonal_offset[i, j] = i - j, the distance of (i, j) from the main diagonal.
	    diagonal_offset = positions[:, None] - positions[None, :]

	    mask_shape = (batch_size, num_heads, context_size, context_size)
	    sparse_mask = torch.zeros(mask_shape, dtype=torch.bool, device=device)
	    for b in range(batch_size):
	        for h in range(num_heads):
	            head_mask = sparse_mask[b, h]  # view: updates write through
	            for column in vertical_indexes[b, h]:
	                head_mask[:, column] = True
	            for distance in slash_indexes[b, h]:
	                head_mask.logical_or_(diagonal_offset == distance)
	    sparse_mask.logical_and_(causal_mask)
	    return sparse_mask


	def make_block_mask(
	    block_count: torch.Tensor,
	    block_offset: torch.Tensor,
	    column_count: torch.Tensor,
	    column_index: torch.Tensor,
	    seqlens: torch.Tensor,
	    causal_mask: torch.Tensor,
	    device: torch.device,
	    block_size_M: int = 64,
	    block_size_N: int = 64,
	):
	    """Expand a block/column sparse index into a dense boolean attention mask.

	    For every ``block_size_M``-row query tile that starts inside the sequence,
	    the first ``column_count`` entries of ``column_index`` enable single key
	    columns and the first ``block_count`` entries of ``block_offset`` enable
	    ``block_size_N``-wide key blocks. The result is AND-ed with
	    ``causal_mask``, whose last dimension defines the context size.

	    ``block_size_N`` defaults to the integer 64. The previous default was the
	    float ``64.``, which made the block slice bound a float and raised a
	    ``TypeError`` whenever the default was used.

	    Returns:
	        Boolean tensor ``[batch, heads, context_size, context_size]`` on
	        ``device``.
	    """
	    batch_size, num_heads, _ = block_count.shape
	    context_size = causal_mask.shape[-1]
	    block_mask = torch.zeros((batch_size, num_heads, context_size, context_size), dtype=torch.bool, device=device)
	    for b in range(batch_size):
	        for h in range(num_heads):
	            for m, start_m in enumerate(range(0, seqlens[b], block_size_M)):
	                rows = slice(start_m, start_m + block_size_M)
	                # Individual key columns attended by the whole tile.
	                for col_idx in range(column_count[b, h, m]):
	                    block_mask[b, h, rows, column_index[b, h, m, col_idx]] = True
	                # Dense key blocks attended by the whole tile.
	                for blk_idx in range(block_count[b, h, m]):
	                    blk_start = block_offset[b, h, m, blk_idx].item()
	                    blk_end = blk_start + block_size_N
	                    block_mask[b, h, rows, blk_start:blk_end] = True
	    block_mask.logical_and_(causal_mask)
	    return block_mask


	def plot_mask(mask, name, batch=0, head=0):
	    """Save a heatmap of ``mask[batch, head]`` to the file ``name``.

	    The figure is closed after saving. pyplot keeps every figure it creates
	    alive until it is closed, so repeated calls would otherwise accumulate
	    open figures.
	    """
	    import matplotlib.pyplot as plt
	    import seaborn as sns

	    fig = plt.figure(figsize=(16, 12))
	    try:
	        sns.heatmap(mask[batch, head].cpu().numpy())
	        plt.savefig(name)
	    finally:
	        # Release the figure even when plotting or saving fails.
	        plt.close(fig)


	# Forward kernel for dense causal attention with per-batch sequence lengths.
	# Launch layout: program_id(0) selects the BLOCK_M-row query tile and
	# program_id(1) the flattened (batch, head) pair, with batch = off_hz // H and
	# head = off_hz % H.
	# Q, K, V and Out are accessed through block pointers; the softmax is
	# evaluated online in base 2 with q pre-scaled by sm_scale * log2(e).
	@triton.jit
	def triton_dense_fwd_kernel(
	Q, K, V, seqlens, sm_scale,
	Out,
	stride_qz, stride_qh, stride_qm, stride_qk,
	stride_kz, stride_kh, stride_kn, stride_kk,
	stride_vz, stride_vh, stride_vn, stride_vk,
	stride_oz, stride_oh, stride_om, stride_ok,
	Z, H, N_CTX,
	BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr,
	BLOCK_N: tl.constexpr,
	dtype: tl.constexpr,
	):
	start_m = tl.program_id(0)
	off_hz = tl.program_id(1)

	# seqlens is indexed by batch; a tile that starts at or beyond the sequence
	# end exits without writing anything.
	seqlen = tl.load(seqlens + off_hz // H)
	if start_m * BLOCK_M >= seqlen:
	return

	# Base offsets of this (batch, head) slice. The Q strides are reused for Out
	# and the K strides for V when computing these bases.
	qo_offset = (off_hz // H) * stride_qz + (off_hz % H) * stride_qh
	kv_offset = (off_hz // H) * stride_kz + (off_hz % H) * stride_kh
	Q_block_ptr = tl.make_block_ptr(
	base=Q + qo_offset,
	shape=(N_CTX, BLOCK_DMODEL),
	strides=(stride_qm, stride_qk),
	offsets=(start_m * BLOCK_M, 0),
	block_shape=(BLOCK_M, BLOCK_DMODEL),
	order=(1, 0)
	)
	# K is described as a (BLOCK_DMODEL, N_CTX) view by swapping its strides, so
	# the loaded tile can be used directly as the right operand of tl.dot(q, k).
	K_block_ptr = tl.make_block_ptr(
	base=K + kv_offset,
	shape=(BLOCK_DMODEL, N_CTX),
	strides=(stride_kk, stride_kn),
	offsets=(0, 0),
	block_shape=(BLOCK_DMODEL, BLOCK_N),
	order=(0, 1)
	)
	V_block_ptr = tl.make_block_ptr(
	base=V + kv_offset,
	shape=(N_CTX, BLOCK_DMODEL),
	strides=(stride_vn, stride_vk),
	offsets=(0, 0),
	block_shape=(BLOCK_N, BLOCK_DMODEL),
	order=(1, 0)
	)
	# initialize offsets
	offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
	offs_n = tl.arange(0, BLOCK_N)
	# initialize pointer to m and l
	m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
	l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
	acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
	# scale sm_scale by log_2(e) and use
	# 2^x instead of exp in the loop because CSE and LICM
	# don't work as expected with `exp` in the loop
	qk_scale = sm_scale * 1.44269504
	# load q: it will stay in SRAM throughout
	# NOTE(review): the block-pointer loads and the final store pass no
	# boundary_check, so N_CTX presumably has to be a multiple of the block
	# sizes - confirm with callers.
	q = tl.load(Q_block_ptr)
	q = (q * qk_scale).to(dtype)
	# loop over k, v and update accumulator
	# Causality: keys are only visited up to the last row of this query tile.
	lo = 0
	hi = (start_m + 1) * BLOCK_M
	# m_mask marks the query rows of this tile that lie inside the sequence.
	m_mask = offs_m[:, None] < seqlen

	for start_n in range(lo, hi, BLOCK_N):
	# Element-wise causal mask: key position <= query position.
	n_mask = (start_n + offs_n[None, :]) <= offs_m[:, None]
	# -- load k, v --
	k = tl.load(K_block_ptr)
	v = tl.load(V_block_ptr)
	# -- compute qk --
	# Masked positions start at -inf so they contribute exp2(-inf) = 0 below.
	qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
	qk = tl.where(m_mask & n_mask, qk, float("-inf"))
	qk += tl.dot(q, k)
	# -- compute scaling constant --
	m_i_new = tl.maximum(m_i, tl.max(qk, 1))
	alpha = tl.math.exp2(m_i - m_i_new)
	p = tl.math.exp2(qk - m_i_new[:, None])
	# -- scale and update acc --
	acc_scale = l_i * 0 + alpha # workaround some compiler bug
	acc *= acc_scale[:, None]
	acc += tl.dot(p.to(dtype), v)
	# -- update m_i and l_i --
	l_i = l_i * alpha + tl.sum(p, 1)
	m_i = m_i_new
	# update pointers
	K_block_ptr = tl.advance(K_block_ptr, (0, BLOCK_N))
	V_block_ptr = tl.advance(V_block_ptr, (BLOCK_N, 0))
	# write back O
	# Rows outside the sequence are written as 0.0; the store covers the whole tile.
	acc = tl.where(m_mask, acc / l_i[:, None], 0.0)
	O_block_ptr = tl.make_block_ptr(
	base=Out + qo_offset,
	shape=(N_CTX, BLOCK_DMODEL),
	strides=(stride_om, stride_ok),
	offsets=(start_m * BLOCK_M, 0),
	block_shape=(BLOCK_M, BLOCK_DMODEL),
	order=(1, 0)
	)
	tl.store(O_block_ptr, acc.to(dtype))


	def triton_dense_forward(q, k, v, seqlens, sm_scale, block_size_M=128, block_size_N=64) -> torch.Tensor:
	    """Launch ``triton_dense_fwd_kernel`` and return the attention output.

	    The output is a zero-initialised tensor shaped like ``q``. One program is
	    launched per (query tile, batch * head) pair, with 4 warps for head dims
	    up to 64 and 8 warps above. The kernel compute dtype is bfloat16 when
	    ``q`` is bfloat16 and float16 for every other input dtype.

	    Raises:
	        AssertionError: if the head dims of ``q``, ``k`` and ``v`` differ or
	            the head dim is not one of 16, 32, 64, 128.
	    """
	    # shape constraints
	    head_dim_q = q.shape[-1]
	    head_dim_k = k.shape[-1]
	    head_dim_v = v.shape[-1]
	    assert head_dim_q == head_dim_k and head_dim_k == head_dim_v
	    assert head_dim_k in {16, 32, 64, 128}

	    batch, heads, n_ctx = q.shape[0], q.shape[1], q.shape[2]
	    out = torch.zeros_like(q)

	    if head_dim_k <= 64:
	        warps = 4
	    else:
	        warps = 8

	    if q.dtype == torch.bfloat16:
	        kernel_dtype = tl.bfloat16
	    else:
	        kernel_dtype = tl.float16

	    launch_grid = (triton.cdiv(n_ctx, block_size_M), batch * heads, 1)
	    triton_dense_fwd_kernel[launch_grid](
	        q, k, v, seqlens, sm_scale,
	        out,
	        q.stride(0), q.stride(1), q.stride(2), q.stride(3),
	        k.stride(0), k.stride(1), k.stride(2), k.stride(3),
	        v.stride(0), v.stride(1), v.stride(2), v.stride(3),
	        out.stride(0), out.stride(1), out.stride(2), out.stride(3),
	        batch, heads, n_ctx,
	        BLOCK_M=block_size_M, BLOCK_N=block_size_N,
	        BLOCK_DMODEL=head_dim_k,
	        dtype=kernel_dtype,
	        num_warps=warps, num_stages=4,
	    )

	    return out


	def flash_attn_forward(q, k, v, seqlens, sm_scale, context_size) -> torch.Tensor:
	    """Run causal FlashAttention on variable-length inputs.

	    ``seqlens`` is forwarded as the cumulative sequence lengths of both
	    queries and keys, ``context_size`` as the maximum sequence length of both,
	    and ``sm_scale`` as the softmax scale. Dropout is disabled.
	    """
	    varlen_options = dict(
	        cu_seqlens_q=seqlens,
	        cu_seqlens_k=seqlens,
	        max_seqlen_q=context_size,
	        max_seqlen_k=context_size,
	        dropout_p=0.0,
	        softmax_scale=sm_scale,
	        causal=True,
	    )
	    return flash_attn_varlen_func(q, k, v, **varlen_options)


	def torch_forward(
	    query: torch.Tensor,
	    key: torch.Tensor,
	    value: torch.Tensor,
	    mask: torch.Tensor,
	    sm_scale: float,
	) -> torch.Tensor:
	    """Reference masked softmax attention written in plain PyTorch.

	    Scores are ``query @ key^T * sm_scale`` with positions where ``mask`` is
	    False set to ``-inf``. The row maximum used for stabilisation is floored
	    at 0 and ``1e-6`` is added to the softmax denominator, so a fully masked
	    row yields an all-zero output instead of NaN.
	    """
	    scores = torch.einsum('bhmk, bhnk -> bhmn', query, key) * sm_scale
	    scores = scores.where(mask, -torch.inf)

	    row_max = scores.max(-1, keepdim=True).values
	    # Negative maxima (including -inf for fully masked rows) are replaced by 0.
	    row_max = torch.where(row_max < 0, 0.0, row_max)

	    unnormalized = torch.exp(scores - row_max)
	    denominator = unnormalized.sum(-1, keepdim=True) + 1e-6
	    weights = unnormalized / denominator
	    return torch.einsum('bhmn, bhnk -> bhmk', weights, value)


	def profile(fn, total_flops, tag, warmup=25, rep=100):
	    """Benchmark ``fn`` with ``triton.testing.do_bench`` and print its throughput.

	    Args:
	        fn: zero-argument callable to time.
	        total_flops: floating-point operations performed by one call of ``fn``.
	        tag: label printed in front of the measurement.
	        warmup: forwarded to ``do_bench``.
	        rep: forwarded to ``do_bench``.
	    """
	    ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
	    # do_bench reports milliseconds: FLOP / ms * 1e3 is FLOP/s, and a further
	    # 1e-9 converts to GFLOP/s. The previous factor of 1e-9 produced TFLOP/s
	    # while the label said GFLOP/s.
	    gflops = total_flops / ms * 1e-6
	    print(f'{tag}: {ms:.3f} ms | {gflops:.3f} GFLOP/s')


	# End-to-end self-check and micro-benchmark for the ops in this file.
	# Steps: build random (or caller-supplied) q/k/v, sequence lengths and
	# vertical/slash indexes; build the sparse index with the PyCUDA kernel;
	# when torch_test is set, compare it and the attention outputs against the
	# torch references; profile index building, the dense Triton kernel, the
	# sparse Triton kernel and FlashAttention.
	# Side effects: prints to stdout and, when torch_test is set, writes
	# 'mask.png' and 'mask-1.png' through plot_mask.
	def test_flash_attention(
	query=None,
	key=None,
	value=None,
	seqlens=None,
	vertical_indexes=None,
	slash_indexes=None,
	dtype=torch.float16,
	device="cuda",
	torch_test=True,
	batch_size=4,
	num_heads=32,
	context_size=2048,
	head_dim=128,
	nnz_v=100,
	nnz_s=10,
	block_size_M=64,
	block_size_N=64,
	):
	print('========================================')
	# Random inputs are generated only when query, key and value are all omitted;
	# otherwise the given values are copied and the size arguments are taken from q.
	if query is None and key is None and value is None:
	q = torch.randn((batch_size, num_heads, context_size, head_dim), dtype=dtype, device=device)
	k = torch.randn((batch_size, num_heads, context_size, head_dim), dtype=dtype, device=device)
	v = torch.randn((batch_size, num_heads, context_size, head_dim), dtype=dtype, device=device)
	else:
	q = torch.tensor(query, dtype=dtype, device=device)
	k = torch.tensor(key, dtype=dtype, device=device)
	v = torch.tensor(value, dtype=dtype, device=device)
	batch_size, num_heads, context_size, head_dim = q.shape
	print(f'BATCH={batch_size}, N_CTX={context_size}, N_HEADS={num_heads}, D_HEAD={head_dim}')
	# Default sequence lengths are drawn from [context_size // 2, context_size).
	if seqlens is None:
	seqlens = torch.randint(context_size // 2, context_size, (batch_size, ), dtype=torch.int32, device=device)
	else:
	seqlens = torch.tensor(seqlens, dtype=torch.int32, device=device)
	print(seqlens)
	# Number of attended (query, key) pairs of the dense causal mask, approximated
	# as seqlen^2 / 2 per head; used for the FLOP estimates below.
	dense_mask_nnz = seqlens.to(torch.float32).square().sum().item() * num_heads / 2
	sm_scale = head_dim ** -0.5

	if torch_test:
	causal_mask = make_causal_mask(seqlens, device, context_size)
	ref_o_dense = torch_forward(q, k, v, causal_mask, sm_scale)

	# Random index: nnz_v ascending vertical columns per head, and nnz_s - 1
	# distinct slash offsets >= 1 in descending order followed by a final 0.
	if vertical_indexes is None or slash_indexes is None:
	vertical_indexes = torch.stack([
	torch.stack([
	torch.randperm(seqlen, dtype=torch.int32, device=device)[:nnz_v].sort(descending=False)[0]
	for _ in range(num_heads)
	])
	for seqlen in seqlens
	])
	slash_indexes = torch.concatenate([
	torch.stack([
	torch.stack([
	torch.randperm(seqlen - 1, dtype=torch.int32, device=device)[:nnz_s - 1].sort(descending=True)[0] + 1
	for _ in range(num_heads)
	])
	for seqlen in seqlens
	]),
	torch.zeros((batch_size, num_heads, 1), dtype=torch.int32, device=device)
	], dim=-1)
	pycuda_build_index_fn = lambda: pycuda_build_index(
	seqlens, vertical_indexes, slash_indexes, context_size, block_size_M, block_size_N
	)
	indexes = pycuda_build_index_fn()
	block_count, block_offset, column_count, column_index = indexes
	# The GPU index must match the host reference exactly.
	if torch_test:
	block_count_ref, block_offset_ref, column_count_ref, column_index_ref = torch_build_index(
	seqlens, vertical_indexes, slash_indexes, context_size, block_size_M, block_size_N
	)
	torch.testing.assert_close(block_count_ref, block_count)
	torch.testing.assert_close(block_offset_ref, block_offset)
	torch.testing.assert_close(column_count_ref, column_count)
	torch.testing.assert_close(column_index_ref, column_index)
	# Attended pairs of the sparse pattern: each column covers block_size_M rows,
	# each block covers block_size_M * block_size_N pairs.
	sparse_mask_nnz = column_count.to(torch.float64).sum().item() * block_size_M + \
	block_count.to(torch.float64).sum().item() * block_size_M * block_size_N
	print(f'block mask sparsity: {1 - sparse_mask_nnz / dense_mask_nnz}')

	pycuda_build_index_fn = lambda: pycuda_build_index(
	seqlens, vertical_indexes, slash_indexes, context_size, block_size_M, block_size_N
	)
	profile(pycuda_build_index_fn, 0., 'pycuda-index')

	if torch_test:
	finegrained_mask = make_finegrained_mask(vertical_indexes, slash_indexes, causal_mask, device)
	block_mask = make_block_mask(*indexes, seqlens, causal_mask, device, block_size_M, block_size_N)
	plot_mask(finegrained_mask, 'mask.png', 0, 0)
	plot_mask(block_mask, 'mask-1.png', 0, 0)
	ref_o_sparse = torch_forward(q, k, v, block_mask, sm_scale)

	triton_dense_fn = lambda: triton_dense_forward(q, k, v, seqlens, sm_scale)
	output_triton_dense = triton_dense_fn()
	if torch_test:
	# Note: not correct for context_size % block_size_M != 0
	torch.testing.assert_close(output_triton_dense, ref_o_dense, atol=1e-2, rtol=0)
	profile(triton_dense_fn, 2. * head_dim * dense_mask_nnz, 'triton-dense')

	triton_sparse_fn = lambda: triton_sparse_forward(q, k, v, seqlens, *indexes, sm_scale, block_size_M, block_size_N)
	output_triton_sparse = triton_sparse_fn()
	if torch_test:
	torch.testing.assert_close(output_triton_sparse, ref_o_sparse, atol=1e-2, rtol=0)
	profile(triton_sparse_fn, 2. * head_dim * sparse_mask_nnz, 'triton-sparse')

	# Repack for the varlen FlashAttention call: move the sequence axis in front
	# of the head axis, keep only the first seqlen tokens of every batch entry
	# and concatenate them. From here on q, k, v and seqlens are rebound:
	# seqlens becomes the cumulative lengths with a leading 0.
	q = q.swapaxes(1, 2).contiguous()
	k = k.swapaxes(1, 2).contiguous()
	v = v.swapaxes(1, 2).contiguous()
	q = torch.concatenate([q[i, :seqlen, :, :] for i, seqlen in enumerate(seqlens)])
	k = torch.concatenate([k[i, :seqlen, :, :] for i, seqlen in enumerate(seqlens)])
	v = torch.concatenate([v[i, :seqlen, :, :] for i, seqlen in enumerate(seqlens)])
	seqlens = torch.nn.functional.pad(torch.cumsum(seqlens, dim=0, dtype=torch.int32), (1, 0))

	flash_fn = lambda: flash_attn_forward(q, k, v, seqlens, sm_scale, context_size)
	output_flash = flash_fn()
	# Unpack the packed output: slice each sequence, zero-pad it back to
	# context_size tokens and restore the [batch, heads, tokens, dim] layout.
	output_flash = torch.stack([
	torch.nn.functional.pad(
	output_flash[seqlens[i]:seqlens[i + 1], :, :],
	(0, 0, 0, 0, 0, context_size + seqlens[i] - seqlens[i + 1])
	)
	for i in range(batch_size)
	]).swapaxes(1, 2).contiguous()
	if torch_test:
	torch.testing.assert_close(output_flash, ref_o_dense, atol=1e-2, rtol=0)
	profile(flash_fn, 2. * head_dim * dense_mask_nnz, 'flash-dense')
	print('========================================\n')

	# When the sparse pattern covers at least as many pairs as the dense causal
	# estimate, the sparse output is additionally compared with FlashAttention.
	if torch_test and sparse_mask_nnz >= dense_mask_nnz:
	torch.testing.assert_close(output_flash, output_triton_sparse, atol=1e-2, rtol=0)


	def vertical_slash_sparse_attention(
	    query: torch.Tensor,  # [BATCH, N_HEADS, N_CTX, D_HEAD]
	    key: torch.Tensor,  # [BATCH, N_HEADS, N_CTX, D_HEAD]
	    value: torch.Tensor,  # [BATCH, N_HEADS, N_CTX, D_HEAD]
	    v_idx: torch.Tensor,  # [BATCH, N_HEADS, NNZ_V]
	    s_idx: torch.Tensor,  # [BATCH, N_HEADS, NNZ_S]
	    block_size_M: int = 64,
	    block_size_N: int = 64,
	):
	    """Causal attention restricted to a vertical + slash sparse pattern.

	    The inputs are zero-padded along the sequence axis so that every query
	    tile is complete, and along the head dimension when it is not one of the
	    listed sizes. The vertical indexes are sorted ascending and the slash
	    indexes descending, the sparse index is built with
	    ``pycuda_build_index`` and the attention itself is computed by
	    ``triton_sparse_forward``. The softmax scale is ``head_dim ** -0.5`` of
	    the unpadded head dim.

	    Args:
	        query, key, value: ``[batch, heads, context, head_dim]`` tensors.
	        v_idx: vertical (key column) indexes, reshaped to ``[batch, heads, -1]``.
	        s_idx: slash (diagonal offset) indexes, reshaped to ``[batch, heads, -1]``.
	        block_size_M: query rows per tile.
	        block_size_N: key columns per block.

	    Returns:
	        Attention output with the padding removed again, shaped like the
	        original ``query``.
	    """
	    batch_size, num_heads, context_size, head_dim = query.shape

	    # Sequence padding in 1..block_size_M rows; the bit mask assumes
	    # block_size_M is a power of two.
	    pad = block_size_M - (context_size & (block_size_M - 1))
	    seq_padding = [0, 0, 0, pad, 0, 0, 0, 0]
	    query = torch.nn.functional.pad(query, seq_padding)
	    key = torch.nn.functional.pad(key, seq_padding)
	    value = torch.nn.functional.pad(value, seq_padding)

	    if head_dim not in [16, 32, 64, 128, 256, 512]:
	        # Round the head dim up to the next power of two, but never below 16:
	        # small powers of two such as 8 previously received no padding and
	        # then failed the head-dim assertion in triton_sparse_forward.
	        padded_dim = max(16, 2 ** math.ceil(math.log2(head_dim)))
	        target_dim = padded_dim - head_dim
	        dim_padding = [0, target_dim, 0, 0, 0, 0, 0, 0]
	        query = torch.nn.functional.pad(query, dim_padding)
	        key = torch.nn.functional.pad(key, dim_padding)
	        value = torch.nn.functional.pad(value, dim_padding)

	    v_idx = v_idx.to(torch.int32).reshape((batch_size, num_heads, -1)).sort(dim=-1, descending=False)[0]
	    s_idx = s_idx.to(torch.int32).reshape((batch_size, num_heads, -1)).sort(dim=-1, descending=True)[0]

	    # One length per batch entry: both kernels index seqlens by batch, so a
	    # single-element tensor would be read out of bounds for batch_size > 1.
	    seqlens = torch.full((batch_size,), context_size, dtype=torch.int32, device=query.device)
	    sm_scale = head_dim ** -0.5

	    block_count, block_offset, column_count, column_index = pycuda_build_index(
	        seqlens, v_idx, s_idx, context_size, block_size_M, block_size_N,
	    )
	    out = triton_sparse_forward(
	        query, key, value, seqlens,
	        block_count, block_offset, column_count, column_index,
	        sm_scale, block_size_M, block_size_N,
	    )
	    # Drop the sequence and head-dim padding added above.
	    return out[..., :context_size, :head_dim]