medmekk committed
Commit 85c0263 · 1 Parent(s): 916c2f7

Upstream builds

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. build.toml +2 -1
  2. build/torch25-cxx11-cu118-x86_64-linux/mamba_ssm/__init__.py +0 -14
  3. build/torch25-cxx11-cu118-x86_64-linux/mamba_ssm/distributed/tensor_parallel.py +0 -326
  4. build/torch25-cxx11-cu118-x86_64-linux/mamba_ssm/models/mixer_seq_simple.py +0 -338
  5. build/torch25-cxx11-cu118-x86_64-linux/mamba_ssm/ops/selective_scan_interface.py +0 -659
  6. build/torch25-cxx11-cu118-x86_64-linux/mamba_ssm/ops/triton/layer_norm.py +0 -1166
  7. build/torch25-cxx11-cu118-x86_64-linux/mamba_ssm/ops/triton/selective_state_update.py +0 -389
  8. build/torch25-cxx11-cu118-x86_64-linux/mamba_ssm/ops/triton/ssd_chunk_scan.py +0 -0
  9. build/torch25-cxx11-cu118-x86_64-linux/mamba_ssm/ops/triton/ssd_chunk_state.py +0 -2012
  10. build/torch25-cxx11-cu118-x86_64-linux/mamba_ssm/ops/triton/ssd_combined.py +0 -1884
  11. build/torch25-cxx11-cu121-x86_64-linux/mamba_ssm/__init__.py +0 -14
  12. build/torch25-cxx11-cu121-x86_64-linux/mamba_ssm/distributed/tensor_parallel.py +0 -326
  13. build/torch25-cxx11-cu121-x86_64-linux/mamba_ssm/models/mixer_seq_simple.py +0 -338
  14. build/torch25-cxx11-cu121-x86_64-linux/mamba_ssm/ops/selective_scan_interface.py +0 -659
  15. build/torch25-cxx11-cu121-x86_64-linux/mamba_ssm/ops/triton/layer_norm.py +0 -1166
  16. build/torch25-cxx11-cu121-x86_64-linux/mamba_ssm/ops/triton/selective_state_update.py +0 -389
  17. build/torch25-cxx11-cu121-x86_64-linux/mamba_ssm/ops/triton/ssd_chunk_scan.py +0 -0
  18. build/torch25-cxx11-cu121-x86_64-linux/mamba_ssm/ops/triton/ssd_chunk_state.py +0 -2012
  19. build/torch25-cxx11-cu121-x86_64-linux/mamba_ssm/ops/triton/ssd_combined.py +0 -1884
  20. build/torch25-cxx11-cu124-x86_64-linux/mamba_ssm/__init__.py +0 -14
  21. build/torch25-cxx11-cu124-x86_64-linux/mamba_ssm/distributed/__init__.py +0 -0
  22. build/torch25-cxx11-cu124-x86_64-linux/mamba_ssm/distributed/tensor_parallel.py +0 -326
  23. build/torch25-cxx11-cu124-x86_64-linux/mamba_ssm/models/__init__.py +0 -0
  24. build/torch25-cxx11-cu124-x86_64-linux/mamba_ssm/models/mixer_seq_simple.py +0 -338
  25. build/torch25-cxx11-cu124-x86_64-linux/mamba_ssm/modules/__init__.py +0 -0
  26. build/torch25-cxx11-cu124-x86_64-linux/mamba_ssm/ops/__init__.py +0 -0
  27. build/torch25-cxx11-cu124-x86_64-linux/mamba_ssm/ops/selective_scan_interface.py +0 -659
  28. build/torch25-cxx11-cu124-x86_64-linux/mamba_ssm/ops/triton/__init__.py +0 -0
  29. build/torch25-cxx11-cu124-x86_64-linux/mamba_ssm/ops/triton/layer_norm.py +0 -1166
  30. build/torch25-cxx11-cu124-x86_64-linux/mamba_ssm/ops/triton/selective_state_update.py +0 -389
  31. build/torch25-cxx11-cu124-x86_64-linux/mamba_ssm/ops/triton/ssd_chunk_scan.py +0 -0
  32. build/torch25-cxx11-cu124-x86_64-linux/mamba_ssm/ops/triton/ssd_chunk_state.py +0 -2012
  33. build/torch25-cxx11-cu124-x86_64-linux/mamba_ssm/ops/triton/ssd_combined.py +0 -1884
  34. build/torch25-cxx11-cu124-x86_64-linux/mamba_ssm/utils/__init__.py +0 -0
  35. build/torch25-cxx98-cu118-x86_64-linux/mamba_ssm/__init__.py +0 -14
  36. build/torch25-cxx98-cu118-x86_64-linux/mamba_ssm/distributed/__init__.py +0 -0
  37. build/torch25-cxx98-cu118-x86_64-linux/mamba_ssm/distributed/tensor_parallel.py +0 -326
  38. build/torch25-cxx98-cu118-x86_64-linux/mamba_ssm/models/__init__.py +0 -0
  39. build/torch25-cxx98-cu118-x86_64-linux/mamba_ssm/models/mixer_seq_simple.py +0 -338
  40. build/torch25-cxx98-cu118-x86_64-linux/mamba_ssm/modules/__init__.py +0 -0
  41. build/torch25-cxx98-cu118-x86_64-linux/mamba_ssm/ops/__init__.py +0 -0
  42. build/torch25-cxx98-cu118-x86_64-linux/mamba_ssm/ops/selective_scan_interface.py +0 -659
  43. build/torch25-cxx98-cu118-x86_64-linux/mamba_ssm/ops/triton/__init__.py +0 -0
  44. build/torch25-cxx98-cu118-x86_64-linux/mamba_ssm/ops/triton/layer_norm.py +0 -1166
  45. build/torch25-cxx98-cu118-x86_64-linux/mamba_ssm/ops/triton/selective_state_update.py +0 -389
  46. build/torch25-cxx98-cu118-x86_64-linux/mamba_ssm/ops/triton/ssd_chunk_scan.py +0 -0
  47. build/torch25-cxx98-cu118-x86_64-linux/mamba_ssm/ops/triton/ssd_chunk_state.py +0 -2012
  48. build/torch25-cxx98-cu118-x86_64-linux/mamba_ssm/ops/triton/ssd_combined.py +0 -1884
  49. build/torch25-cxx98-cu118-x86_64-linux/mamba_ssm/utils/__init__.py +0 -0
  50. build/torch25-cxx98-cu121-x86_64-linux/mamba_ssm/__init__.py +0 -14
build.toml CHANGED
@@ -1,6 +1,7 @@
  [general]
  name = "mamba_ssm"
- universal = false
+ backends = ["cuda"]
+ python-depends = ["einops"]
 
  [torch]
  src = [
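
The hunk above drops the `universal = false` flag in favour of an explicit `backends = ["cuda"]` declaration and lists `einops` under `python-depends`. Purely for orientation (not part of this commit), a kernel published from a build.toml like this is typically consumed through the Hugging Face `kernels` loader; the repository id below is a placeholder assumption:

```python
# Hypothetical consumer-side sketch; the repository id is a placeholder,
# not something defined in this commit.
from kernels import get_kernel

mamba_ssm = get_kernel("<namespace>/mamba-ssm")  # downloads a prebuilt variant
print([name for name in dir(mamba_ssm) if not name.startswith("_")])
```
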
build/torch25-cxx11-cu118-x86_64-linux/mamba_ssm/__init__.py DELETED
@@ -1,14 +0,0 @@
- __version__ = "2.2.4"
-
- from .ops.selective_scan_interface import selective_scan_fn, mamba_inner_fn
- from .modules.mamba_simple import Mamba
- from .modules.mamba2 import Mamba2
- from .models.mixer_seq_simple import MambaLMHeadModel
-
- __all__ = [
-     "selective_scan_fn",
-     "mamba_inner_fn",
-     "Mamba",
-     "Mamba2",
-     "MambaLMHeadModel",
- ]
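
The deleted `__init__.py` re-exported the package's public entry points (`selective_scan_fn`, `mamba_inner_fn`, `Mamba`, `Mamba2`, `MambaLMHeadModel`). For orientation only, a minimal sketch of that public surface under the old layout; the layer dimensions below are illustrative assumptions:

```python
# Minimal sketch of the public API the removed __init__.py used to expose.
# d_model / d_state / d_conv / expand values are arbitrary illustrative choices.
import torch
from mamba_ssm import Mamba

layer = Mamba(d_model=256, d_state=16, d_conv=4, expand=2).to("cuda")
x = torch.randn(2, 64, 256, device="cuda")  # (batch, seqlen, d_model)
y = layer(x)                                # output has the same shape as x
```
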
@@ -1,326 +0,0 @@
1
- # Copyright (c) 2024, Tri Dao.
2
- # The TensorParallel linear modules are inspired by https://github.com/NVIDIA/apex/blob/master/apex/transformer/tensor_parallel/layers.py
3
- from typing import Optional
4
-
5
- import torch
6
- import torch.nn as nn
7
- import torch.nn.functional as F
8
- from torch import Tensor
9
- from torch.distributed import ProcessGroup
10
- from ..utils.torch import custom_bwd, custom_fwd
11
-
12
- from einops import rearrange
13
-
14
- from ..distributed.distributed_utils import (
15
- all_gather_raw,
16
- all_reduce,
17
- all_reduce_raw,
18
- reduce_scatter,
19
- reduce_scatter_raw,
20
- )
21
-
22
-
23
- class ParallelLinearFunc(torch.autograd.Function):
24
- @staticmethod
25
- @custom_fwd
26
- def forward(ctx, x, weight, bias, process_group=None, sequence_parallel=True):
27
- """
28
- If process_group is not None and sequence_parallel=True, we're doing Tensor Parallel
29
- with sequence parallelism: we do an all_gather_raw of x before doing the matmul.
30
- """
31
- ctx.compute_weight_gradient = weight.requires_grad
32
- ctx.process_group = process_group
33
- ctx.sequence_parallel = sequence_parallel
34
-
35
- if torch.is_autocast_enabled():
36
- x = x.to(dtype=torch.get_autocast_gpu_dtype())
37
- x = x.contiguous()
38
- if process_group is not None and sequence_parallel:
39
- # We want to kick off the all_gather early, before weight dtype conversion
40
- total_x, handle_x = all_gather_raw(x, process_group, async_op=True)
41
- else:
42
- total_x = x
43
-
44
- if torch.is_autocast_enabled():
45
- weight = weight.to(dtype=torch.get_autocast_gpu_dtype())
46
- bias = (
47
- bias.to(dtype=torch.get_autocast_gpu_dtype())
48
- if bias is not None
49
- else None
50
- )
51
- weight = weight.contiguous()
52
- if process_group is not None and sequence_parallel:
53
- handle_x.wait()
54
- batch_shape, n = total_x.shape[:-1], total_x.shape[-1]
55
- batch_dim = batch_shape.numel()
56
- # https://github.com/pytorch/pytorch/blob/5b51849b48a7dbccd297286cc0110def4706f9e7/aten/src/ATen/native/cuda/Blas.cpp#L174
57
- output = F.linear(total_x, weight, bias)
58
- if ctx.compute_weight_gradient:
59
- ctx.save_for_backward(x, weight)
60
- else:
61
- ctx.save_for_backward(weight)
62
- return output
63
-
64
- @staticmethod
65
- @custom_bwd
66
- def backward(ctx, grad_output):
67
- grad_output = grad_output.contiguous()
68
- process_group = ctx.process_group
69
- sequence_parallel = ctx.sequence_parallel
70
- if ctx.compute_weight_gradient:
71
- x, weight = ctx.saved_tensors
72
- if process_group is not None and sequence_parallel:
73
- total_x, handle_x = all_gather_raw(x, process_group, async_op=True)
74
- else:
75
- total_x = x
76
- else:
77
- (weight,) = ctx.saved_tensors
78
- total_x = None
79
- batch_shape = grad_output.shape[:-1]
80
- batch_dim = batch_shape.numel()
81
- grad_output = grad_output.reshape(batch_dim, grad_output.shape[-1])
82
- if ctx.needs_input_grad[0]:
83
- grad_input = F.linear(grad_output, weight.t())
84
- grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1])
85
- if process_group is not None:
86
- reduce_fn = reduce_scatter_raw if sequence_parallel else all_reduce_raw
87
- grad_input, handle_grad_input = reduce_fn(
88
- grad_input, process_group, async_op=True
89
- )
90
- else:
91
- grad_input = None
92
- if ctx.needs_input_grad[1]:
93
- assert ctx.compute_weight_gradient
94
- if process_group is not None and sequence_parallel:
95
- handle_x.wait()
96
- grad_weight = torch.einsum(
97
- "bo,bi->oi", grad_output, total_x.reshape(batch_dim, total_x.shape[-1])
98
- )
99
- else:
100
- grad_weight = None
101
- grad_bias = grad_output.sum(dim=0) if ctx.needs_input_grad[2] else None
102
- if process_group is not None and ctx.needs_input_grad[0]:
103
- handle_grad_input.wait()
104
- return grad_input, grad_weight, grad_bias, None, None
105
-
106
-
107
- def parallel_linear_func(
108
- x: Tensor,
109
- weight: Tensor,
110
- bias: Optional[Tensor] = None,
111
- process_group: Optional[ProcessGroup] = None,
112
- sequence_parallel: bool = True,
113
- ):
114
- return ParallelLinearFunc.apply(x, weight, bias, process_group, sequence_parallel)
115
-
116
-
117
- class ColumnParallelLinear(nn.Linear):
118
- def __init__(
119
- self,
120
- in_features: int,
121
- out_features: int,
122
- process_group: ProcessGroup,
123
- bias: bool = True,
124
- sequence_parallel=True,
125
- multiple_of=1,
126
- device=None,
127
- dtype=None,
128
- ) -> None:
129
- world_size = torch.distributed.get_world_size(process_group)
130
- if out_features % multiple_of:
131
- raise ValueError(
132
- f"out_features ({out_features}) must be a multiple of {multiple_of}"
133
- )
134
- multiple = out_features // multiple_of
135
- # We want to split @multiple across world_size, but it could be an uneven split
136
- div = multiple // world_size
137
- mod = multiple % world_size
138
- # The first @mod ranks get @div + 1 copies, the rest get @div copies
139
- local_multiple = div + int(torch.distributed.get_rank(process_group) < mod)
140
- super().__init__(
141
- in_features,
142
- local_multiple * multiple_of,
143
- bias=bias,
144
- device=device,
145
- dtype=dtype,
146
- )
147
- self.process_group = process_group
148
- self.sequence_parallel = sequence_parallel
149
-
150
- def forward(self, x):
151
- # If self.sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism:
152
- # we do an all_gather of x before doing the matmul.
153
- # If not, then the input is already gathered.
154
- return parallel_linear_func(
155
- x,
156
- self.weight,
157
- self.bias,
158
- process_group=self.process_group,
159
- sequence_parallel=self.sequence_parallel,
160
- )
161
-
162
-
163
- class RowParallelLinear(nn.Linear):
164
- def __init__(
165
- self,
166
- in_features: int,
167
- out_features: int,
168
- process_group: ProcessGroup,
169
- bias: bool = True,
170
- sequence_parallel=True,
171
- multiple_of=1,
172
- device=None,
173
- dtype=None,
174
- ) -> None:
175
- world_size = torch.distributed.get_world_size(process_group)
176
- rank = torch.distributed.get_rank(process_group)
177
- if in_features % multiple_of:
178
- raise ValueError(
179
- f"in_features ({in_features}) must be a multiple of {multiple_of}"
180
- )
181
- multiple = in_features // multiple_of
182
- # We want to split @multiple across world_size, but it could be an uneven split
183
- div = multiple // world_size
184
- mod = multiple % world_size
185
- # The first @mod ranks get @div + 1 copies, the rest get @div copies
186
- local_multiple = div + int(torch.distributed.get_rank(process_group) < mod)
187
- # Only rank 0 will have bias
188
- super().__init__(
189
- local_multiple * multiple_of,
190
- out_features,
191
- bias=bias and rank == 0,
192
- device=device,
193
- dtype=dtype,
194
- )
195
- self.process_group = process_group
196
- self.sequence_parallel = sequence_parallel
197
-
198
- def forward(self, x):
199
- """
200
- We're doing Tensor Parallel with sequence parallelism: we do the matmul and then
201
- a reduce_scatter of the result.
202
- """
203
- out = parallel_linear_func(x, self.weight, self.bias)
204
- reduce_fn = reduce_scatter if self.sequence_parallel else all_reduce
205
- return reduce_fn(out, self.process_group)
206
-
207
-
208
- class VocabParallelEmbedding(nn.Embedding):
209
- def __init__(
210
- self, num_embeddings, *args, process_group=None, padding_idx=None, **kwargs
211
- ):
212
- self.process_group = process_group
213
- if process_group is not None:
214
- world_size = torch.distributed.get_world_size(process_group)
215
- if num_embeddings % world_size != 0:
216
- raise ValueError(
217
- f"num_embeddings ({num_embeddings}) must be divisible by "
218
- f"world_size ({world_size})"
219
- )
220
- if world_size > 1 and padding_idx is not None:
221
- raise RuntimeError("ParallelEmbedding does not support padding_idx")
222
- else:
223
- world_size = 1
224
- super().__init__(
225
- num_embeddings // world_size, *args, padding_idx=padding_idx, **kwargs
226
- )
227
-
228
- def forward(self, input: Tensor) -> Tensor:
229
- if self.process_group is None:
230
- return super().forward(input)
231
- else:
232
- rank = torch.distributed.get_rank(self.process_group)
233
- vocab_size = self.num_embeddings
234
- vocab_start_index, vocab_end_index = (
235
- rank * vocab_size,
236
- (rank + 1) * vocab_size,
237
- )
238
- # Create a mask of valid vocab ids (1 means it needs to be masked).
239
- input_ids_mask = (input < vocab_start_index) | (input >= vocab_end_index)
240
- input = input - vocab_start_index
241
- input[input_ids_mask] = 0
242
- embeddings = super().forward(input)
243
- embeddings[input_ids_mask] = 0.0
244
- return embeddings
245
-
246
-
247
- class ColumnParallelEmbedding(nn.Embedding):
248
- def __init__(
249
- self, num_embeddings, embedding_dim, *args, process_group=None, **kwargs
250
- ):
251
- self.process_group = process_group
252
- if process_group is not None:
253
- world_size = torch.distributed.get_world_size(process_group)
254
- if embedding_dim % world_size != 0:
255
- raise ValueError(
256
- f"embedding_dim ({embedding_dim}) must be divisible by "
257
- f"world_size ({world_size})"
258
- )
259
- else:
260
- world_size = 1
261
- super().__init__(num_embeddings, embedding_dim // world_size, *args, **kwargs)
262
-
263
-
264
- class ParallelEmbeddings(nn.Module):
265
- def __init__(
266
- self,
267
- embed_dim,
268
- vocab_size,
269
- max_position_embeddings,
270
- process_group,
271
- padding_idx=None,
272
- sequence_parallel=True,
273
- device=None,
274
- dtype=None,
275
- ):
276
- """
277
- If max_position_embeddings <= 0, there's no position embeddings
278
- """
279
- factory_kwargs = {"device": device, "dtype": dtype}
280
- super().__init__()
281
- self.process_group = process_group
282
- self.sequence_parallel = sequence_parallel
283
- self.word_embeddings = VocabParallelEmbedding(
284
- vocab_size,
285
- embed_dim,
286
- padding_idx=padding_idx,
287
- process_group=process_group,
288
- **factory_kwargs,
289
- )
290
- self.max_position_embeddings = max_position_embeddings
291
- if self.max_position_embeddings > 0:
292
- self.position_embeddings = ColumnParallelEmbedding(
293
- max_position_embeddings,
294
- embed_dim,
295
- process_group=process_group,
296
- **factory_kwargs,
297
- )
298
-
299
- def forward(self, input_ids, position_ids=None, combine_batch_seqlen_dim=False):
300
- """
301
- input_ids: (batch, seqlen)
302
- position_ids: (batch, seqlen)
303
- """
304
- batch_size, seqlen = input_ids.shape
305
- world_size = torch.distributed.get_world_size(self.process_group)
306
- embeddings = self.word_embeddings(input_ids)
307
- if self.max_position_embeddings > 0:
308
- if position_ids is None:
309
- position_ids = torch.arange(
310
- seqlen, dtype=torch.long, device=input_ids.device
311
- )
312
- position_embeddings = self.position_embeddings(position_ids)
313
- if world_size <= 1:
314
- embeddings = embeddings + position_embeddings
315
- else:
316
- partition_dim = self.position_embeddings.embedding_dim
317
- rank = torch.distributed.get_rank(self.process_group)
318
- embeddings[
319
- ..., rank * partition_dim : (rank + 1) * partition_dim
320
- ] += position_embeddings
321
- if combine_batch_seqlen_dim:
322
- embeddings = rearrange(embeddings, "b s d -> (b s) d")
323
- reduce_fn = reduce_scatter if self.sequence_parallel else all_reduce
324
- return (
325
- embeddings if world_size <= 1 else reduce_fn(embeddings, self.process_group)
326
- )
build/torch25-cxx11-cu118-x86_64-linux/mamba_ssm/models/mixer_seq_simple.py DELETED
@@ -1,338 +0,0 @@
1
- # Copyright (c) 2023, Albert Gu, Tri Dao.
2
-
3
- import math
4
- from functools import partial
5
- import json
6
- import os
7
- import copy
8
-
9
- from collections import namedtuple
10
-
11
- import torch
12
- import torch.nn as nn
13
-
14
- from .config_mamba import MambaConfig
15
- from ..modules.mamba_simple import Mamba
16
- from ..modules.mamba2 import Mamba2
17
- from ..modules.mha import MHA
18
- from ..modules.mlp import GatedMLP
19
- from ..modules.block import Block
20
- from ..utils.generation import GenerationMixin
21
- from ..utils.hf import load_config_hf, load_state_dict_hf
22
-
23
- try:
24
- from ..ops.triton.layer_norm import RMSNorm, layer_norm_fn, rms_norm_fn
25
- except ImportError:
26
- RMSNorm, layer_norm_fn, rms_norm_fn = None, None, None
27
-
28
-
29
- def create_block(
30
- d_model,
31
- d_intermediate,
32
- ssm_cfg=None,
33
- attn_layer_idx=None,
34
- attn_cfg=None,
35
- norm_epsilon=1e-5,
36
- rms_norm=False,
37
- residual_in_fp32=False,
38
- fused_add_norm=False,
39
- layer_idx=None,
40
- device=None,
41
- dtype=None,
42
- ):
43
- if ssm_cfg is None:
44
- ssm_cfg = {}
45
- if attn_layer_idx is None:
46
- attn_layer_idx = []
47
- if attn_cfg is None:
48
- attn_cfg = {}
49
- factory_kwargs = {"device": device, "dtype": dtype}
50
- if layer_idx not in attn_layer_idx:
51
- # Create a copy of the config to modify
52
- ssm_cfg = copy.deepcopy(ssm_cfg) if ssm_cfg is not None else {}
53
- ssm_layer = ssm_cfg.pop("layer", "Mamba1")
54
- if ssm_layer not in ["Mamba1", "Mamba2"]:
55
- raise ValueError(
56
- f"Invalid ssm_layer: {ssm_layer}, only support Mamba1 and Mamba2"
57
- )
58
- mixer_cls = partial(
59
- Mamba2 if ssm_layer == "Mamba2" else Mamba,
60
- layer_idx=layer_idx,
61
- **ssm_cfg,
62
- **factory_kwargs,
63
- )
64
- else:
65
- mixer_cls = partial(MHA, layer_idx=layer_idx, **attn_cfg, **factory_kwargs)
66
- norm_cls = partial(
67
- nn.LayerNorm if not rms_norm else RMSNorm, eps=norm_epsilon, **factory_kwargs
68
- )
69
- if d_intermediate == 0:
70
- mlp_cls = nn.Identity
71
- else:
72
- mlp_cls = partial(
73
- GatedMLP,
74
- hidden_features=d_intermediate,
75
- out_features=d_model,
76
- **factory_kwargs,
77
- )
78
- block = Block(
79
- d_model,
80
- mixer_cls,
81
- mlp_cls,
82
- norm_cls=norm_cls,
83
- fused_add_norm=fused_add_norm,
84
- residual_in_fp32=residual_in_fp32,
85
- )
86
- block.layer_idx = layer_idx
87
- return block
88
-
89
-
90
- # https://github.com/huggingface/transformers/blob/c28d04e9e252a1a099944e325685f14d242ecdcd/src/transformers/models/gpt2/modeling_gpt2.py#L454
91
- def _init_weights(
92
- module,
93
- n_layer,
94
- initializer_range=0.02, # Now only used for embedding layer.
95
- rescale_prenorm_residual=True,
96
- n_residuals_per_layer=1, # Change to 2 if we have MLP
97
- ):
98
- if isinstance(module, nn.Linear):
99
- if module.bias is not None:
100
- if not getattr(module.bias, "_no_reinit", False):
101
- nn.init.zeros_(module.bias)
102
- elif isinstance(module, nn.Embedding):
103
- nn.init.normal_(module.weight, std=initializer_range)
104
-
105
- if rescale_prenorm_residual:
106
- # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
107
- # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
108
- # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
109
- # > -- GPT-2 :: https://openai.com/blog/better-language-models/
110
- #
111
- # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
112
- for name, p in module.named_parameters():
113
- if name in ["out_proj.weight", "fc2.weight"]:
114
- # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
115
- # Following Pytorch init, except scale by 1/sqrt(2 * n_layer)
116
- # We need to reinit p since this code could be called multiple times
117
- # Having just p *= scale would repeatedly scale it down
118
- nn.init.kaiming_uniform_(p, a=math.sqrt(5))
119
- with torch.no_grad():
120
- p /= math.sqrt(n_residuals_per_layer * n_layer)
121
-
122
-
123
- class MixerModel(nn.Module):
124
- def __init__(
125
- self,
126
- d_model: int,
127
- n_layer: int,
128
- d_intermediate: int,
129
- vocab_size: int,
130
- ssm_cfg=None,
131
- attn_layer_idx=None,
132
- attn_cfg=None,
133
- norm_epsilon: float = 1e-5,
134
- rms_norm: bool = False,
135
- initializer_cfg=None,
136
- fused_add_norm=False,
137
- residual_in_fp32=False,
138
- device=None,
139
- dtype=None,
140
- ) -> None:
141
- factory_kwargs = {"device": device, "dtype": dtype}
142
- super().__init__()
143
- self.residual_in_fp32 = residual_in_fp32
144
-
145
- self.embedding = nn.Embedding(vocab_size, d_model, **factory_kwargs)
146
-
147
- # We change the order of residual and layer norm:
148
- # Instead of LN -> Attn / MLP -> Add, we do:
149
- # Add -> LN -> Attn / MLP / Mixer, returning both the residual branch (output of Add) and
150
- # the main branch (output of MLP / Mixer). The model definition is unchanged.
151
- # This is for performance reason: we can fuse add + layer_norm.
152
- self.fused_add_norm = fused_add_norm
153
- if self.fused_add_norm:
154
- if layer_norm_fn is None or rms_norm_fn is None:
155
- raise ImportError("Failed to import Triton LayerNorm / RMSNorm kernels")
156
-
157
- self.layers = nn.ModuleList(
158
- [
159
- create_block(
160
- d_model,
161
- d_intermediate=d_intermediate,
162
- ssm_cfg=ssm_cfg,
163
- attn_layer_idx=attn_layer_idx,
164
- attn_cfg=attn_cfg,
165
- norm_epsilon=norm_epsilon,
166
- rms_norm=rms_norm,
167
- residual_in_fp32=residual_in_fp32,
168
- fused_add_norm=fused_add_norm,
169
- layer_idx=i,
170
- **factory_kwargs,
171
- )
172
- for i in range(n_layer)
173
- ]
174
- )
175
-
176
- self.norm_f = (nn.LayerNorm if not rms_norm else RMSNorm)(
177
- d_model, eps=norm_epsilon, **factory_kwargs
178
- )
179
-
180
- self.apply(
181
- partial(
182
- _init_weights,
183
- n_layer=n_layer,
184
- **(initializer_cfg if initializer_cfg is not None else {}),
185
- n_residuals_per_layer=(
186
- 1 if d_intermediate == 0 else 2
187
- ), # 2 if we have MLP
188
- )
189
- )
190
-
191
- def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
192
- return {
193
- i: layer.allocate_inference_cache(
194
- batch_size, max_seqlen, dtype=dtype, **kwargs
195
- )
196
- for i, layer in enumerate(self.layers)
197
- }
198
-
199
- def forward(self, input_ids, inference_params=None, **mixer_kwargs):
200
- hidden_states = self.embedding(input_ids)
201
- residual = None
202
- for layer in self.layers:
203
- hidden_states, residual = layer(
204
- hidden_states,
205
- residual,
206
- inference_params=inference_params,
207
- **mixer_kwargs,
208
- )
209
- if not self.fused_add_norm:
210
- residual = (
211
- (hidden_states + residual) if residual is not None else hidden_states
212
- )
213
- hidden_states = self.norm_f(residual.to(dtype=self.norm_f.weight.dtype))
214
- else:
215
- # Set prenorm=False here since we don't need the residual
216
- hidden_states = layer_norm_fn(
217
- hidden_states,
218
- self.norm_f.weight,
219
- self.norm_f.bias,
220
- eps=self.norm_f.eps,
221
- residual=residual,
222
- prenorm=False,
223
- residual_in_fp32=self.residual_in_fp32,
224
- is_rms_norm=isinstance(self.norm_f, RMSNorm),
225
- )
226
- return hidden_states
227
-
228
-
229
- class MambaLMHeadModel(nn.Module, GenerationMixin):
230
-
231
- def __init__(
232
- self,
233
- config: MambaConfig,
234
- initializer_cfg=None,
235
- device=None,
236
- dtype=None,
237
- ) -> None:
238
- self.config = config
239
- d_model = config.d_model
240
- n_layer = config.n_layer
241
- d_intermediate = config.d_intermediate
242
- vocab_size = config.vocab_size
243
- ssm_cfg = config.ssm_cfg
244
- attn_layer_idx = config.attn_layer_idx
245
- attn_cfg = config.attn_cfg
246
- rms_norm = config.rms_norm
247
- residual_in_fp32 = config.residual_in_fp32
248
- fused_add_norm = config.fused_add_norm
249
- pad_vocab_size_multiple = config.pad_vocab_size_multiple
250
- factory_kwargs = {"device": device, "dtype": dtype}
251
-
252
- super().__init__()
253
- if vocab_size % pad_vocab_size_multiple != 0:
254
- vocab_size += pad_vocab_size_multiple - (
255
- vocab_size % pad_vocab_size_multiple
256
- )
257
- self.backbone = MixerModel(
258
- d_model=d_model,
259
- n_layer=n_layer,
260
- d_intermediate=d_intermediate,
261
- vocab_size=vocab_size,
262
- ssm_cfg=ssm_cfg,
263
- attn_layer_idx=attn_layer_idx,
264
- attn_cfg=attn_cfg,
265
- rms_norm=rms_norm,
266
- initializer_cfg=initializer_cfg,
267
- fused_add_norm=fused_add_norm,
268
- residual_in_fp32=residual_in_fp32,
269
- **factory_kwargs,
270
- )
271
- self.lm_head = nn.Linear(d_model, vocab_size, bias=False, **factory_kwargs)
272
-
273
- # Initialize weights and apply final processing
274
- self.apply(
275
- partial(
276
- _init_weights,
277
- n_layer=n_layer,
278
- **(initializer_cfg if initializer_cfg is not None else {}),
279
- )
280
- )
281
- self.tie_weights()
282
-
283
- def tie_weights(self):
284
- if self.config.tie_embeddings:
285
- self.lm_head.weight = self.backbone.embedding.weight
286
-
287
- def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
288
- return self.backbone.allocate_inference_cache(
289
- batch_size, max_seqlen, dtype=dtype, **kwargs
290
- )
291
-
292
- def forward(
293
- self,
294
- input_ids,
295
- position_ids=None,
296
- inference_params=None,
297
- num_last_tokens=0,
298
- **mixer_kwargs,
299
- ):
300
- """
301
- "position_ids" is just to be compatible with Transformer generation. We don't use it.
302
- num_last_tokens: if > 0, only return the logits for the last n tokens
303
- """
304
- hidden_states = self.backbone(
305
- input_ids, inference_params=inference_params, **mixer_kwargs
306
- )
307
- if num_last_tokens > 0:
308
- hidden_states = hidden_states[:, -num_last_tokens:]
309
- lm_logits = self.lm_head(hidden_states)
310
- CausalLMOutput = namedtuple("CausalLMOutput", ["logits"])
311
- return CausalLMOutput(logits=lm_logits)
312
-
313
- @classmethod
314
- def from_pretrained(cls, pretrained_model_name, device=None, dtype=None, **kwargs):
315
- config_data = load_config_hf(pretrained_model_name)
316
- config = MambaConfig(**config_data)
317
- model = cls(config, device=device, dtype=dtype, **kwargs)
318
- model.load_state_dict(
319
- load_state_dict_hf(pretrained_model_name, device=device, dtype=dtype)
320
- )
321
- return model
322
-
323
- def save_pretrained(self, save_directory):
324
- """
325
- Minimal implementation of save_pretrained for MambaLMHeadModel.
326
- Save the model and its configuration file to a directory.
327
- """
328
- # Ensure save_directory exists
329
- os.makedirs(save_directory, exist_ok=True)
330
-
331
- # Save the model's state_dict
332
- model_path = os.path.join(save_directory, "pytorch_model.bin")
333
- torch.save(self.state_dict(), model_path)
334
-
335
- # Save the configuration of the model
336
- config_path = os.path.join(save_directory, "config.json")
337
- with open(config_path, "w") as f:
338
- json.dump(self.config.__dict__, f, indent=4)
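
For reference, the deleted `mixer_seq_simple.py` above defines `MambaLMHeadModel`, whose `from_pretrained` pulls a config and state dict from the Hub and whose `forward` returns a `CausalLMOutput` namedtuple. A brief illustrative sketch; the checkpoint name is an example and not something referenced by this commit:

```python
# Illustrative use of the MambaLMHeadModel defined in the file above.
# "state-spaces/mamba-130m" is an example checkpoint, not part of this commit.
import torch
from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel

model = MambaLMHeadModel.from_pretrained(
    "state-spaces/mamba-130m", device="cuda", dtype=torch.float16
)
input_ids = torch.randint(0, model.config.vocab_size, (1, 16), device="cuda")
logits = model(input_ids).logits  # (batch, seqlen, padded vocab size)
```
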
build/torch25-cxx11-cu118-x86_64-linux/mamba_ssm/ops/selective_scan_interface.py DELETED
@@ -1,659 +0,0 @@
1
- # Copyright (c) 2023, Tri Dao, Albert Gu.
2
-
3
- import torch
4
- import torch.nn.functional as F
5
- from ..utils.torch import custom_fwd, custom_bwd
6
-
7
- from einops import rearrange, repeat
8
-
9
- try:
10
- from causal_conv1d import causal_conv1d_fn
11
- import causal_conv1d_cuda
12
- except ImportError:
13
- causal_conv1d_fn = None
14
- causal_conv1d_cuda = None
15
-
16
- from .triton.layer_norm import _layer_norm_fwd
17
-
18
- from .._ops import ops
19
-
20
-
21
- class SelectiveScanFn(torch.autograd.Function):
22
-
23
- @staticmethod
24
- def forward(
25
- ctx,
26
- u,
27
- delta,
28
- A,
29
- B,
30
- C,
31
- D=None,
32
- z=None,
33
- delta_bias=None,
34
- delta_softplus=False,
35
- return_last_state=False,
36
- ):
37
- if u.stride(-1) != 1:
38
- u = u.contiguous()
39
- if delta.stride(-1) != 1:
40
- delta = delta.contiguous()
41
- if D is not None:
42
- D = D.contiguous()
43
- if B.stride(-1) != 1:
44
- B = B.contiguous()
45
- if C.stride(-1) != 1:
46
- C = C.contiguous()
47
- if z is not None and z.stride(-1) != 1:
48
- z = z.contiguous()
49
- if B.dim() == 3:
50
- B = rearrange(B, "b dstate l -> b 1 dstate l")
51
- ctx.squeeze_B = True
52
- if C.dim() == 3:
53
- C = rearrange(C, "b dstate l -> b 1 dstate l")
54
- ctx.squeeze_C = True
55
- out, x, *rest = ops.selective_scan_fwd(
56
- u, delta, A, B, C, D, z, delta_bias, delta_softplus
57
- )
58
- ctx.delta_softplus = delta_softplus
59
- ctx.has_z = z is not None
60
- last_state = x[:, :, -1, 1::2] # (batch, dim, dstate)
61
- if not ctx.has_z:
62
- ctx.save_for_backward(u, delta, A, B, C, D, delta_bias, x)
63
- return out if not return_last_state else (out, last_state)
64
- else:
65
- ctx.save_for_backward(u, delta, A, B, C, D, z, delta_bias, x, out)
66
- out_z = rest[0]
67
- return out_z if not return_last_state else (out_z, last_state)
68
-
69
- @staticmethod
70
- def backward(ctx, dout, *args):
71
- if not ctx.has_z:
72
- u, delta, A, B, C, D, delta_bias, x = ctx.saved_tensors
73
- z = None
74
- out = None
75
- else:
76
- u, delta, A, B, C, D, z, delta_bias, x, out = ctx.saved_tensors
77
- if dout.stride(-1) != 1:
78
- dout = dout.contiguous()
79
- # The kernel supports passing in a pre-allocated dz (e.g., in case we want to fuse the
80
- # backward of selective_scan_cuda with the backward of chunk).
81
- # Here we just pass in None and dz will be allocated in the C++ code.
82
- du, ddelta, dA, dB, dC, dD, ddelta_bias, *rest = ops.selective_scan_bwd(
83
- u,
84
- delta,
85
- A,
86
- B,
87
- C,
88
- D,
89
- z,
90
- delta_bias,
91
- dout,
92
- x,
93
- out,
94
- None,
95
- ctx.delta_softplus,
96
- False, # option to recompute out_z, not used here
97
- )
98
- dz = rest[0] if ctx.has_z else None
99
- dB = dB.squeeze(1) if getattr(ctx, "squeeze_B", False) else dB
100
- dC = dC.squeeze(1) if getattr(ctx, "squeeze_C", False) else dC
101
- return (
102
- du,
103
- ddelta,
104
- dA,
105
- dB,
106
- dC,
107
- dD if D is not None else None,
108
- dz,
109
- ddelta_bias if delta_bias is not None else None,
110
- None,
111
- None,
112
- )
113
-
114
-
115
- def rms_norm_forward(
116
- x,
117
- weight,
118
- bias,
119
- eps=1e-6,
120
- is_rms_norm=True,
121
- ):
122
- # x (b l) d
123
- if x.stride(-1) != 1:
124
- x = x.contiguous()
125
- weight = weight.contiguous()
126
- if bias is not None:
127
- bias = bias.contiguous()
128
- y = _layer_norm_fwd(
129
- x, weight, bias, eps, None, residual_dtype=None, is_rms_norm=is_rms_norm
130
- )[0]
131
- # y (b l) d
132
- return y
133
-
134
-
135
- def selective_scan_fn(
136
- u,
137
- delta,
138
- A,
139
- B,
140
- C,
141
- D=None,
142
- z=None,
143
- delta_bias=None,
144
- delta_softplus=False,
145
- return_last_state=False,
146
- ):
147
- """if return_last_state is True, returns (out, last_state)
148
- last_state has shape (batch, dim, dstate). Note that the gradient of the last state is
149
- not considered in the backward pass.
150
- """
151
- return SelectiveScanFn.apply(
152
- u, delta, A, B, C, D, z, delta_bias, delta_softplus, return_last_state
153
- )
154
-
155
-
156
- def selective_scan_ref(
157
- u,
158
- delta,
159
- A,
160
- B,
161
- C,
162
- D=None,
163
- z=None,
164
- delta_bias=None,
165
- delta_softplus=False,
166
- return_last_state=False,
167
- ):
168
- """
169
- u: r(B D L)
170
- delta: r(B D L)
171
- A: c(D N) or r(D N)
172
- B: c(D N) or r(B N L) or r(B N 2L) or r(B G N L) or (B G N L)
173
- C: c(D N) or r(B N L) or r(B N 2L) or r(B G N L) or (B G N L)
174
- D: r(D)
175
- z: r(B D L)
176
- delta_bias: r(D), fp32
177
-
178
- out: r(B D L)
179
- last_state (optional): r(B D dstate) or c(B D dstate)
180
- """
181
- dtype_in = u.dtype
182
- u = u.float()
183
- delta = delta.float()
184
- if delta_bias is not None:
185
- delta = delta + delta_bias[..., None].float()
186
- if delta_softplus:
187
- delta = F.softplus(delta)
188
- batch, dim, dstate = u.shape[0], A.shape[0], A.shape[1]
189
- is_variable_B = B.dim() >= 3
190
- is_variable_C = C.dim() >= 3
191
- if A.is_complex():
192
- if is_variable_B:
193
- B = torch.view_as_complex(
194
- rearrange(B.float(), "... (L two) -> ... L two", two=2)
195
- )
196
- if is_variable_C:
197
- C = torch.view_as_complex(
198
- rearrange(C.float(), "... (L two) -> ... L two", two=2)
199
- )
200
- else:
201
- B = B.float()
202
- C = C.float()
203
- x = A.new_zeros((batch, dim, dstate))
204
- ys = []
205
- deltaA = torch.exp(torch.einsum("bdl,dn->bdln", delta, A))
206
- if not is_variable_B:
207
- deltaB_u = torch.einsum("bdl,dn,bdl->bdln", delta, B, u)
208
- else:
209
- if B.dim() == 3:
210
- deltaB_u = torch.einsum("bdl,bnl,bdl->bdln", delta, B, u)
211
- else:
212
- B = repeat(B, "B G N L -> B (G H) N L", H=dim // B.shape[1])
213
- deltaB_u = torch.einsum("bdl,bdnl,bdl->bdln", delta, B, u)
214
- if is_variable_C and C.dim() == 4:
215
- C = repeat(C, "B G N L -> B (G H) N L", H=dim // C.shape[1])
216
- last_state = None
217
- for i in range(u.shape[2]):
218
- x = deltaA[:, :, i] * x + deltaB_u[:, :, i]
219
- if not is_variable_C:
220
- y = torch.einsum("bdn,dn->bd", x, C)
221
- else:
222
- if C.dim() == 3:
223
- y = torch.einsum("bdn,bn->bd", x, C[:, :, i])
224
- else:
225
- y = torch.einsum("bdn,bdn->bd", x, C[:, :, :, i])
226
- if i == u.shape[2] - 1:
227
- last_state = x
228
- if y.is_complex():
229
- y = y.real * 2
230
- ys.append(y)
231
- y = torch.stack(ys, dim=2) # (batch dim L)
232
- out = y if D is None else y + u * rearrange(D, "d -> d 1")
233
- if z is not None:
234
- out = out * F.silu(z)
235
- out = out.to(dtype=dtype_in)
236
- return out if not return_last_state else (out, last_state)
237
-
238
-
239
- class MambaInnerFn(torch.autograd.Function):
240
-
241
- @staticmethod
242
- @custom_fwd
243
- def forward(
244
- ctx,
245
- xz,
246
- conv1d_weight,
247
- conv1d_bias,
248
- x_proj_weight,
249
- delta_proj_weight,
250
- out_proj_weight,
251
- out_proj_bias,
252
- A,
253
- B=None,
254
- C=None,
255
- D=None,
256
- delta_bias=None,
257
- B_proj_bias=None,
258
- C_proj_bias=None,
259
- delta_softplus=True,
260
- checkpoint_lvl=1,
261
- b_rms_weight=None,
262
- c_rms_weight=None,
263
- dt_rms_weight=None,
264
- b_c_dt_rms_eps=1e-6,
265
- ):
266
- """
267
- xz: (batch, dim, seqlen)
268
- """
269
- assert (
270
- causal_conv1d_cuda is not None
271
- ), "causal_conv1d_cuda is not available. Please install causal-conv1d."
272
- assert checkpoint_lvl in [0, 1]
273
- L = xz.shape[-1]
274
- delta_rank = delta_proj_weight.shape[1]
275
- d_state = A.shape[-1] * (1 if not A.is_complex() else 2)
276
- if torch.is_autocast_enabled():
277
- x_proj_weight = x_proj_weight.to(dtype=torch.get_autocast_gpu_dtype())
278
- delta_proj_weight = delta_proj_weight.to(
279
- dtype=torch.get_autocast_gpu_dtype()
280
- )
281
- out_proj_weight = out_proj_weight.to(dtype=torch.get_autocast_gpu_dtype())
282
- out_proj_bias = (
283
- out_proj_bias.to(dtype=torch.get_autocast_gpu_dtype())
284
- if out_proj_bias is not None
285
- else None
286
- )
287
- if xz.stride(-1) != 1:
288
- xz = xz.contiguous()
289
- conv1d_weight = rearrange(conv1d_weight, "d 1 w -> d w")
290
- x, z = xz.chunk(2, dim=1)
291
- conv1d_bias = conv1d_bias.contiguous() if conv1d_bias is not None else None
292
- conv1d_out = causal_conv1d_cuda.causal_conv1d_fwd(
293
- x, conv1d_weight, conv1d_bias, None, None, None, True
294
- )
295
- # We're being very careful here about the layout, to avoid extra transposes.
296
- # We want delta to have d as the slowest moving dimension
297
- # and L as the fastest moving dimension, since those are what the ssm_scan kernel expects.
298
- x_dbl = F.linear(
299
- rearrange(conv1d_out, "b d l -> (b l) d"), x_proj_weight
300
- ) # (bl d)
301
- delta = rearrange(
302
- delta_proj_weight @ x_dbl[:, :delta_rank].t(), "d (b l) -> b d l", l=L
303
- )
304
- ctx.is_variable_B = B is None
305
- ctx.is_variable_C = C is None
306
- ctx.B_proj_bias_is_None = B_proj_bias is None
307
- ctx.C_proj_bias_is_None = C_proj_bias is None
308
- if B is None: # variable B
309
- B = x_dbl[:, delta_rank : delta_rank + d_state] # (bl dstate)
310
- if B_proj_bias is not None:
311
- B = B + B_proj_bias.to(dtype=B.dtype)
312
- if not A.is_complex():
313
- # B = rearrange(B, "(b l) dstate -> b dstate l", l=L).contiguous()
314
- B = rearrange(B, "(b l) dstate -> b 1 dstate l", l=L).contiguous()
315
- else:
316
- B = rearrange(
317
- B, "(b l) (dstate two) -> b 1 dstate (l two)", l=L, two=2
318
- ).contiguous()
319
- else:
320
- if B.stride(-1) != 1:
321
- B = B.contiguous()
322
- if C is None: # variable C
323
- C = x_dbl[:, -d_state:] # (bl dstate)
324
- if C_proj_bias is not None:
325
- C = C + C_proj_bias.to(dtype=C.dtype)
326
- if not A.is_complex():
327
- # C = rearrange(C, "(b l) dstate -> b dstate l", l=L).contiguous()
328
- C = rearrange(C, "(b l) dstate -> b 1 dstate l", l=L).contiguous()
329
- else:
330
- C = rearrange(
331
- C, "(b l) (dstate two) -> b 1 dstate (l two)", l=L, two=2
332
- ).contiguous()
333
- else:
334
- if C.stride(-1) != 1:
335
- C = C.contiguous()
336
- if D is not None:
337
- D = D.contiguous()
338
-
339
- if b_rms_weight is not None:
340
- B = rearrange(B, "b 1 dstate l -> (b l) dstate", l=L).contiguous()
341
- B = rms_norm_forward(B, b_rms_weight, bias=None, eps=b_c_dt_rms_eps)
342
- B = rearrange(B, "(b l) dstate -> b 1 dstate l", l=L).contiguous()
343
- if c_rms_weight is not None:
344
- C = rearrange(C, "b 1 dstate l -> (b l) dstate", l=L).contiguous()
345
- C = rms_norm_forward(C, c_rms_weight, bias=None, eps=b_c_dt_rms_eps)
346
- C = rearrange(C, "(b l) dstate -> b 1 dstate l", l=L).contiguous()
347
- if dt_rms_weight is not None:
348
- delta = rearrange(delta, "b d l -> (b l) d", l=L).contiguous()
349
- delta = rms_norm_forward(
350
- delta, dt_rms_weight, bias=None, eps=b_c_dt_rms_eps
351
- )
352
- delta = rearrange(delta, "(b l) d -> b d l", l=L).contiguous()
353
-
354
- out, scan_intermediates, out_z = ops.selective_scan_fwd(
355
- conv1d_out, delta, A, B, C, D, z, delta_bias, delta_softplus
356
- )
357
- ctx.delta_softplus = delta_softplus
358
- ctx.out_proj_bias_is_None = out_proj_bias is None
359
- ctx.checkpoint_lvl = checkpoint_lvl
360
- ctx.b_rms_weight = b_rms_weight
361
- ctx.c_rms_weight = c_rms_weight
362
- ctx.dt_rms_weight = dt_rms_weight
363
- ctx.b_c_dt_rms_eps = b_c_dt_rms_eps
364
- if (
365
- checkpoint_lvl >= 1
366
- ): # Will recompute conv1d_out and delta in the backward pass
367
- conv1d_out, delta = None, None
368
- ctx.save_for_backward(
369
- xz,
370
- conv1d_weight,
371
- conv1d_bias,
372
- x_dbl,
373
- x_proj_weight,
374
- delta_proj_weight,
375
- out_proj_weight,
376
- conv1d_out,
377
- delta,
378
- A,
379
- B,
380
- C,
381
- D,
382
- delta_bias,
383
- scan_intermediates,
384
- b_rms_weight,
385
- c_rms_weight,
386
- dt_rms_weight,
387
- out,
388
- )
389
- return F.linear(
390
- rearrange(out_z, "b d l -> b l d"), out_proj_weight, out_proj_bias
391
- )
392
-
393
- @staticmethod
394
- @custom_bwd
395
- def backward(ctx, dout):
396
- # dout: (batch, seqlen, dim)
397
- assert (
398
- causal_conv1d_cuda is not None
399
- ), "causal_conv1d_cuda is not available. Please install causal-conv1d."
400
- (
401
- xz,
402
- conv1d_weight,
403
- conv1d_bias,
404
- x_dbl,
405
- x_proj_weight,
406
- delta_proj_weight,
407
- out_proj_weight,
408
- conv1d_out,
409
- delta,
410
- A,
411
- B,
412
- C,
413
- D,
414
- delta_bias,
415
- scan_intermediates,
416
- b_rms_weight,
417
- c_rms_weight,
418
- dt_rms_weight,
419
- out,
420
- ) = ctx.saved_tensors
421
- L = xz.shape[-1]
422
- delta_rank = delta_proj_weight.shape[1]
423
- d_state = A.shape[-1] * (1 if not A.is_complex() else 2)
424
- x, z = xz.chunk(2, dim=1)
425
- if dout.stride(-1) != 1:
426
- dout = dout.contiguous()
427
- if ctx.checkpoint_lvl == 1:
428
- conv1d_out = causal_conv1d_cuda.causal_conv1d_fwd(
429
- x, conv1d_weight, conv1d_bias, None, None, None, True
430
- )
431
- delta = rearrange(
432
- delta_proj_weight @ x_dbl[:, :delta_rank].t(), "d (b l) -> b d l", l=L
433
- )
434
- if dt_rms_weight is not None:
435
- delta = rearrange(delta, "b d l -> (b l) d", l=L).contiguous()
436
- delta = rms_norm_forward(
437
- delta, ctx.dt_rms_weight, None, ctx.b_c_dt_rms_eps
438
- )
439
- delta = rearrange(delta, "(b l) d -> b d l", l=L).contiguous()
440
- if b_rms_weight is not None:
441
- # Recompute & RMSNorm B
442
- B = rearrange(B, "b 1 dstate l -> (b l) dstate", l=L).contiguous()
443
- B = rms_norm_forward(B, ctx.b_rms_weight, None, ctx.b_c_dt_rms_eps)
444
- B = rearrange(B, "(b l) dstate -> b 1 dstate l", l=L).contiguous()
445
- if c_rms_weight is not None:
446
- # Recompute & RMSNorm C
447
- C = rearrange(C, "b 1 dstate l -> (b l) dstate", l=L).contiguous()
448
- C = rms_norm_forward(C, ctx.c_rms_weight, None, ctx.b_c_dt_rms_eps)
449
- C = rearrange(C, "(b l) dstate -> b 1 dstate l", l=L).contiguous()
450
-
451
- # The kernel supports passing in a pre-allocated dz (e.g., in case we want to fuse the
452
- # backward of selective_scan_cuda with the backward of chunk).
453
- dxz = torch.empty_like(xz) # (batch, dim, seqlen)
454
- dx, dz = dxz.chunk(2, dim=1)
455
- dout = rearrange(dout, "b l e -> e (b l)")
456
- dout_y = rearrange(out_proj_weight.t() @ dout, "d (b l) -> b d l", l=L)
457
- dconv1d_out, ddelta, dA, dB, dC, dD, ddelta_bias, dz, out_z = (
458
- ops.selective_scan_bwd(
459
- conv1d_out,
460
- delta,
461
- A,
462
- B,
463
- C,
464
- D,
465
- z,
466
- delta_bias,
467
- dout_y,
468
- scan_intermediates,
469
- out,
470
- dz,
471
- ctx.delta_softplus,
472
- True, # option to recompute out_z
473
- )
474
- )
475
- dout_proj_weight = torch.einsum(
476
- "eB,dB->ed", dout, rearrange(out_z, "b d l -> d (b l)")
477
- )
478
- dout_proj_bias = dout.sum(dim=(0, 1)) if not ctx.out_proj_bias_is_None else None
479
- dD = dD if D is not None else None
480
- dx_dbl = torch.empty_like(x_dbl)
481
- dB_proj_bias = None
482
- if ctx.is_variable_B:
483
- if not A.is_complex():
484
- dB = rearrange(dB, "b 1 dstate l -> (b l) dstate").contiguous()
485
- else:
486
- dB = rearrange(
487
- dB, "b 1 dstate (l two) -> (b l) (dstate two)", two=2
488
- ).contiguous()
489
- dB_proj_bias = dB.sum(0) if not ctx.B_proj_bias_is_None else None
490
- dx_dbl[:, delta_rank : delta_rank + d_state] = dB # (bl d)
491
- dB = None
492
- dC_proj_bias = None
493
- if ctx.is_variable_C:
494
- if not A.is_complex():
495
- dC = rearrange(dC, "b 1 dstate l -> (b l) dstate").contiguous()
496
- else:
497
- dC = rearrange(
498
- dC, "b 1 dstate (l two) -> (b l) (dstate two)", two=2
499
- ).contiguous()
500
- dC_proj_bias = dC.sum(0) if not ctx.C_proj_bias_is_None else None
501
- dx_dbl[:, -d_state:] = dC # (bl d)
502
- dC = None
503
- ddelta = rearrange(ddelta, "b d l -> d (b l)")
504
- ddelta_proj_weight = torch.einsum("dB,Br->dr", ddelta, x_dbl[:, :delta_rank])
505
- dx_dbl[:, :delta_rank] = torch.einsum("dB,dr->Br", ddelta, delta_proj_weight)
506
- dconv1d_out = rearrange(dconv1d_out, "b d l -> d (b l)")
507
- dx_proj_weight = torch.einsum(
508
- "Br,Bd->rd", dx_dbl, rearrange(conv1d_out, "b d l -> (b l) d")
509
- )
510
- dconv1d_out = torch.addmm(
511
- dconv1d_out, x_proj_weight.t(), dx_dbl.t(), out=dconv1d_out
512
- )
513
- dconv1d_out = rearrange(
514
- dconv1d_out, "d (b l) -> b d l", b=x.shape[0], l=x.shape[-1]
515
- )
516
- # The kernel supports passing in a pre-allocated dx (e.g., in case we want to fuse the
517
- # backward of conv1d with the backward of chunk).
518
- dx, dconv1d_weight, dconv1d_bias, *_ = causal_conv1d_cuda.causal_conv1d_bwd(
519
- x,
520
- conv1d_weight,
521
- conv1d_bias,
522
- dconv1d_out,
523
- None,
524
- None,
525
- None,
526
- dx,
527
- False,
528
- True,
529
- )
530
- dconv1d_bias = dconv1d_bias if conv1d_bias is not None else None
531
- dconv1d_weight = rearrange(dconv1d_weight, "d w -> d 1 w")
532
- return (
533
- dxz,
534
- dconv1d_weight,
535
- dconv1d_bias,
536
- dx_proj_weight,
537
- ddelta_proj_weight,
538
- dout_proj_weight,
539
- dout_proj_bias,
540
- dA,
541
- dB,
542
- dC,
543
- dD,
544
- ddelta_bias if delta_bias is not None else None,
545
- # 6-None are delta_softplus, checkpoint_lvl, b_rms_weight, c_rms_weight, dt_rms_weight, b_c_dt_rms_eps
546
- dB_proj_bias,
547
- dC_proj_bias,
548
- None,
549
- None,
550
- None,
551
- None,
552
- None,
553
- None,
554
- )
555
-
556
-
557
- def mamba_inner_fn(
558
- xz,
559
- conv1d_weight,
560
- conv1d_bias,
561
- x_proj_weight,
562
- delta_proj_weight,
563
- out_proj_weight,
564
- out_proj_bias,
565
- A,
566
- B=None,
567
- C=None,
568
- D=None,
569
- delta_bias=None,
570
- B_proj_bias=None,
571
- C_proj_bias=None,
572
- delta_softplus=True,
573
- checkpoint_lvl=1,
574
- b_rms_weight=None,
575
- c_rms_weight=None,
576
- dt_rms_weight=None,
577
- b_c_dt_rms_eps=1e-6,
578
- ):
579
- return MambaInnerFn.apply(
580
- xz,
581
- conv1d_weight,
582
- conv1d_bias,
583
- x_proj_weight,
584
- delta_proj_weight,
585
- out_proj_weight,
586
- out_proj_bias,
587
- A,
588
- B,
589
- C,
590
- D,
591
- delta_bias,
592
- B_proj_bias,
593
- C_proj_bias,
594
- delta_softplus,
595
- checkpoint_lvl,
596
- b_rms_weight,
597
- c_rms_weight,
598
- dt_rms_weight,
599
- b_c_dt_rms_eps,
600
- )
601
-
602
-
603
- def mamba_inner_ref(
604
- xz,
605
- conv1d_weight,
606
- conv1d_bias,
607
- x_proj_weight,
608
- delta_proj_weight,
609
- out_proj_weight,
610
- out_proj_bias,
611
- A,
612
- B=None,
613
- C=None,
614
- D=None,
615
- delta_bias=None,
616
- B_proj_bias=None,
617
- C_proj_bias=None,
618
- delta_softplus=True,
619
- ):
620
- assert (
621
- causal_conv1d_fn is not None
622
- ), "causal_conv1d_fn is not available. Please install causal-conv1d."
623
- L = xz.shape[-1]
624
- delta_rank = delta_proj_weight.shape[1]
625
- d_state = A.shape[-1] * (1 if not A.is_complex() else 2)
626
- x, z = xz.chunk(2, dim=1)
627
- x = causal_conv1d_fn(
628
- x, rearrange(conv1d_weight, "d 1 w -> d w"), conv1d_bias, activation="silu"
629
- )
630
- # We're being very careful here about the layout, to avoid extra transposes.
631
- # We want delta to have d as the slowest moving dimension
632
- # and L as the fastest moving dimension, since those are what the ssm_scan kernel expects.
633
- x_dbl = F.linear(rearrange(x, "b d l -> (b l) d"), x_proj_weight) # (bl d)
634
- delta = delta_proj_weight @ x_dbl[:, :delta_rank].t()
635
- delta = rearrange(delta, "d (b l) -> b d l", l=L)
636
- if B is None: # variable B
637
- B = x_dbl[:, delta_rank : delta_rank + d_state] # (bl d)
638
- if B_proj_bias is not None:
639
- B = B + B_proj_bias.to(dtype=B.dtype)
640
- if not A.is_complex():
641
- B = rearrange(B, "(b l) dstate -> b dstate l", l=L).contiguous()
642
- else:
643
- B = rearrange(
644
- B, "(b l) (dstate two) -> b dstate (l two)", l=L, two=2
645
- ).contiguous()
646
- if C is None: # variable B
647
- C = x_dbl[:, -d_state:] # (bl d)
648
- if C_proj_bias is not None:
649
- C = C + C_proj_bias.to(dtype=C.dtype)
650
- if not A.is_complex():
651
- C = rearrange(C, "(b l) dstate -> b dstate l", l=L).contiguous()
652
- else:
653
- C = rearrange(
654
- C, "(b l) (dstate two) -> b dstate (l two)", l=L, two=2
655
- ).contiguous()
656
- y = selective_scan_fn(
657
- x, delta, A, B, C, D, z=z, delta_bias=delta_bias, delta_softplus=True
658
- )
659
- return F.linear(rearrange(y, "b d l -> b l d"), out_proj_weight, out_proj_bias)
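
The `selective_scan_ref` docstring above pins down the expected shapes (`u`, `delta`: (B, D, L); `A`: (D, N); `B`, `C`: (B, N, L); `D`: (D)). A small illustrative parity check between the fused CUDA path and the reference path, assuming a CUDA device and arbitrary sizes:

```python
# Illustrative consistency check between selective_scan_fn (CUDA kernel path)
# and selective_scan_ref (pure PyTorch), using the shapes documented above.
import torch
from mamba_ssm.ops.selective_scan_interface import selective_scan_fn, selective_scan_ref

b, d, n, L = 2, 8, 16, 32
u = torch.randn(b, d, L, device="cuda")
delta = torch.rand(b, d, L, device="cuda")
A = -torch.rand(d, n, device="cuda")   # negative real A, as in Mamba
B = torch.randn(b, n, L, device="cuda")
C = torch.randn(b, n, L, device="cuda")
D = torch.randn(d, device="cuda")

out = selective_scan_fn(u, delta, A, B, C, D, delta_softplus=True)
out_ref = selective_scan_ref(u, delta, A, B, C, D, delta_softplus=True)
torch.testing.assert_close(out, out_ref, rtol=1e-2, atol=1e-2)
```
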
build/torch25-cxx11-cu118-x86_64-linux/mamba_ssm/ops/triton/layer_norm.py DELETED
@@ -1,1166 +0,0 @@
1
- # Copyright (c) 2024, Tri Dao.
2
- # Implement dropout + residual + layer_norm / rms_norm.
3
-
4
- # Based on the Triton LayerNorm tutorial: https://triton-lang.org/main/getting-started/tutorials/05-layer-norm.html
5
- # For the backward pass, we keep weight_grad and bias_grad in registers and accumulate.
6
- # This is faster for dimensions up to 8k, but after that it's much slower due to register spilling.
7
- # The models we train have hidden dim up to 8k anyway (e.g. Llama 70B), so this is fine.
8
-
9
- import math
10
- import warnings
11
-
12
- import torch
13
- import torch.nn.functional as F
14
- from ...utils.torch import custom_bwd, custom_fwd
15
-
16
- import triton
17
- import triton.language as tl
18
-
19
-
20
- def layer_norm_ref(
21
- x,
22
- weight,
23
- bias,
24
- residual=None,
25
- x1=None,
26
- weight1=None,
27
- bias1=None,
28
- eps=1e-6,
29
- dropout_p=0.0,
30
- rowscale=None,
31
- prenorm=False,
32
- dropout_mask=None,
33
- dropout_mask1=None,
34
- upcast=False,
35
- ):
36
- dtype = x.dtype
37
- if upcast:
38
- x = x.float()
39
- weight = weight.float()
40
- bias = bias.float() if bias is not None else None
41
- residual = residual.float() if residual is not None else residual
42
- x1 = x1.float() if x1 is not None else None
43
- weight1 = weight1.float() if weight1 is not None else None
44
- bias1 = bias1.float() if bias1 is not None else None
45
- if x1 is not None:
46
- assert rowscale is None, "rowscale is not supported with parallel LayerNorm"
47
- if rowscale is not None:
48
- x = x * rowscale[..., None]
49
- if dropout_p > 0.0:
50
- if dropout_mask is not None:
51
- x = x.masked_fill(~dropout_mask, 0.0) / (1.0 - dropout_p)
52
- else:
53
- x = F.dropout(x, p=dropout_p)
54
- if x1 is not None:
55
- if dropout_mask1 is not None:
56
- x1 = x1.masked_fill(~dropout_mask1, 0.0) / (1.0 - dropout_p)
57
- else:
58
- x1 = F.dropout(x1, p=dropout_p)
59
- if x1 is not None:
60
- x = x + x1
61
- if residual is not None:
62
- x = (x + residual).to(x.dtype)
63
- out = F.layer_norm(
64
- x.to(weight.dtype), x.shape[-1:], weight=weight, bias=bias, eps=eps
65
- ).to(dtype)
66
- if weight1 is None:
67
- return out if not prenorm else (out, x)
68
- else:
69
- out1 = F.layer_norm(
70
- x.to(weight1.dtype), x.shape[-1:], weight=weight1, bias=bias1, eps=eps
71
- ).to(dtype)
72
- return (out, out1) if not prenorm else (out, out1, x)
73
-
74
-
75
- def rms_norm_ref(
76
- x,
77
- weight,
78
- bias,
79
- residual=None,
80
- x1=None,
81
- weight1=None,
82
- bias1=None,
83
- eps=1e-6,
84
- dropout_p=0.0,
85
- rowscale=None,
86
- prenorm=False,
87
- dropout_mask=None,
88
- dropout_mask1=None,
89
- upcast=False,
90
- ):
91
- dtype = x.dtype
92
- if upcast:
93
- x = x.float()
94
- weight = weight.float()
95
- bias = bias.float() if bias is not None else None
96
- residual = residual.float() if residual is not None else residual
97
- x1 = x1.float() if x1 is not None else None
98
- weight1 = weight1.float() if weight1 is not None else None
99
- bias1 = bias1.float() if bias1 is not None else None
100
- if x1 is not None:
101
- assert rowscale is None, "rowscale is not supported with parallel LayerNorm"
102
- if rowscale is not None:
103
- x = x * rowscale[..., None]
104
- if dropout_p > 0.0:
105
- if dropout_mask is not None:
106
- x = x.masked_fill(~dropout_mask, 0.0) / (1.0 - dropout_p)
107
- else:
108
- x = F.dropout(x, p=dropout_p)
109
- if x1 is not None:
110
- if dropout_mask1 is not None:
111
- x1 = x1.masked_fill(~dropout_mask1, 0.0) / (1.0 - dropout_p)
112
- else:
113
- x1 = F.dropout(x1, p=dropout_p)
114
- if x1 is not None:
115
- x = x + x1
116
- if residual is not None:
117
- x = (x + residual).to(x.dtype)
118
- rstd = 1 / torch.sqrt((x.square()).mean(dim=-1, keepdim=True) + eps)
119
- out = ((x * rstd * weight) + bias if bias is not None else (x * rstd * weight)).to(
120
- dtype
121
- )
122
- if weight1 is None:
123
- return out if not prenorm else (out, x)
124
- else:
125
- out1 = (
126
- (x * rstd * weight1) + bias1 if bias1 is not None else (x * rstd * weight1)
127
- ).to(dtype)
128
- return (out, out1) if not prenorm else (out, out1, x)
129
-
130
-
131
- def config_prune(configs):
132
-
133
- if torch.version.hip:
134
- try:
135
- # set warp size based on gcn architecure
136
- gcn_arch_name = torch.cuda.get_device_properties(0).gcnArchName
137
- if "gfx10" in gcn_arch_name or "gfx11" in gcn_arch_name:
138
- # radeon
139
- warp_size = 32
140
- else:
141
- # instinct
142
- warp_size = 64
143
- except AttributeError as e:
144
- # fall back to crude method to set warp size
145
- device_name = torch.cuda.get_device_properties(0).name
146
- if "instinct" in device_name.lower():
147
- warp_size = 64
148
- else:
149
- warp_size = 32
150
- warnings.warn(
151
- f"{e}, warp size set to {warp_size} based on device name: {device_name}",
152
- UserWarning,
153
- )
154
-
155
- else:
156
- # cuda
157
- warp_size = 32
158
-
159
- max_block_sz = 1024
160
- max_num_warps = max_block_sz // warp_size
161
- pruned_configs = [config for config in configs if config.num_warps <= max_num_warps]
162
- return pruned_configs
163
-
164
-
165
- configs_autotune = [
166
- triton.Config({}, num_warps=1),
167
- triton.Config({}, num_warps=2),
168
- triton.Config({}, num_warps=4),
169
- triton.Config({}, num_warps=8),
170
- triton.Config({}, num_warps=16),
171
- triton.Config({}, num_warps=32),
172
- ]
173
-
174
- pruned_configs_autotune = config_prune(configs_autotune)
175
-
176
-
177
- @triton.autotune(
178
- configs=pruned_configs_autotune,
179
- key=["N", "HAS_RESIDUAL", "STORE_RESIDUAL_OUT", "IS_RMS_NORM", "HAS_BIAS"],
180
- )
181
- # @triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None})
182
- # @triton.heuristics({"HAS_RESIDUAL": lambda args: args["RESIDUAL"] is not None})
183
- @triton.heuristics({"HAS_X1": lambda args: args["X1"] is not None})
184
- @triton.heuristics({"HAS_W1": lambda args: args["W1"] is not None})
185
- @triton.heuristics({"HAS_B1": lambda args: args["B1"] is not None})
186
- @triton.jit
187
- def _layer_norm_fwd_1pass_kernel(
188
- X, # pointer to the input
189
- Y, # pointer to the output
190
- W, # pointer to the weights
191
- B, # pointer to the biases
192
- RESIDUAL, # pointer to the residual
193
- X1,
194
- W1,
195
- B1,
196
- Y1,
197
- RESIDUAL_OUT, # pointer to the residual
198
- ROWSCALE,
199
- SEEDS, # Dropout seeds for each row
200
- DROPOUT_MASK,
201
- Mean, # pointer to the mean
202
- Rstd, # pointer to the 1/std
203
- stride_x_row, # how much to increase the pointer when moving by 1 row
204
- stride_y_row,
205
- stride_res_row,
206
- stride_res_out_row,
207
- stride_x1_row,
208
- stride_y1_row,
209
- M, # number of rows in X
210
- N, # number of columns in X
211
- eps, # epsilon to avoid division by zero
212
- dropout_p, # Dropout probability
213
- IS_RMS_NORM: tl.constexpr,
214
- BLOCK_N: tl.constexpr,
215
- HAS_RESIDUAL: tl.constexpr,
216
- STORE_RESIDUAL_OUT: tl.constexpr,
217
- HAS_BIAS: tl.constexpr,
218
- HAS_DROPOUT: tl.constexpr,
219
- STORE_DROPOUT_MASK: tl.constexpr,
220
- HAS_ROWSCALE: tl.constexpr,
221
- HAS_X1: tl.constexpr,
222
- HAS_W1: tl.constexpr,
223
- HAS_B1: tl.constexpr,
224
- ):
225
- # Map the program id to the row of X and Y it should compute.
226
- row = tl.program_id(0)
227
- X += row * stride_x_row
228
- Y += row * stride_y_row
229
- if HAS_RESIDUAL:
230
- RESIDUAL += row * stride_res_row
231
- if STORE_RESIDUAL_OUT:
232
- RESIDUAL_OUT += row * stride_res_out_row
233
- if HAS_X1:
234
- X1 += row * stride_x1_row
235
- if HAS_W1:
236
- Y1 += row * stride_y1_row
237
- # Compute mean and variance
238
- cols = tl.arange(0, BLOCK_N)
239
- x = tl.load(X + cols, mask=cols < N, other=0.0).to(tl.float32)
240
- if HAS_ROWSCALE:
241
- rowscale = tl.load(ROWSCALE + row).to(tl.float32)
242
- x *= rowscale
243
- if HAS_DROPOUT:
244
- # Compute dropout mask
245
- # 7 rounds is good enough, and reduces register pressure
246
- keep_mask = (
247
- tl.rand(tl.load(SEEDS + row).to(tl.uint32), cols, n_rounds=7) > dropout_p
248
- )
249
- x = tl.where(keep_mask, x / (1.0 - dropout_p), 0.0)
250
- if STORE_DROPOUT_MASK:
251
- tl.store(DROPOUT_MASK + row * N + cols, keep_mask, mask=cols < N)
252
- if HAS_X1:
253
- x1 = tl.load(X1 + cols, mask=cols < N, other=0.0).to(tl.float32)
254
- if HAS_ROWSCALE:
255
- rowscale = tl.load(ROWSCALE + M + row).to(tl.float32)
256
- x1 *= rowscale
257
- if HAS_DROPOUT:
258
- # Compute dropout mask
259
- # 7 rounds is good enough, and reduces register pressure
260
- keep_mask = (
261
- tl.rand(tl.load(SEEDS + M + row).to(tl.uint32), cols, n_rounds=7)
262
- > dropout_p
263
- )
264
- x1 = tl.where(keep_mask, x1 / (1.0 - dropout_p), 0.0)
265
- if STORE_DROPOUT_MASK:
266
- tl.store(DROPOUT_MASK + (M + row) * N + cols, keep_mask, mask=cols < N)
267
- x += x1
268
- if HAS_RESIDUAL:
269
- residual = tl.load(RESIDUAL + cols, mask=cols < N, other=0.0).to(tl.float32)
270
- x += residual
271
- if STORE_RESIDUAL_OUT:
272
- tl.store(RESIDUAL_OUT + cols, x, mask=cols < N)
273
- if not IS_RMS_NORM:
274
- mean = tl.sum(x, axis=0) / N
275
- tl.store(Mean + row, mean)
276
- xbar = tl.where(cols < N, x - mean, 0.0)
277
- var = tl.sum(xbar * xbar, axis=0) / N
278
- else:
279
- xbar = tl.where(cols < N, x, 0.0)
280
- var = tl.sum(xbar * xbar, axis=0) / N
281
- rstd = 1 / tl.sqrt(var + eps)
282
- tl.store(Rstd + row, rstd)
283
- # Normalize and apply linear transformation
284
- mask = cols < N
285
- w = tl.load(W + cols, mask=mask).to(tl.float32)
286
- if HAS_BIAS:
287
- b = tl.load(B + cols, mask=mask).to(tl.float32)
288
- x_hat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd
289
- y = x_hat * w + b if HAS_BIAS else x_hat * w
290
- # Write output
291
- tl.store(Y + cols, y, mask=mask)
292
- if HAS_W1:
293
- w1 = tl.load(W1 + cols, mask=mask).to(tl.float32)
294
- if HAS_B1:
295
- b1 = tl.load(B1 + cols, mask=mask).to(tl.float32)
296
- y1 = x_hat * w1 + b1 if HAS_B1 else x_hat * w1
297
- tl.store(Y1 + cols, y1, mask=mask)
298
-
299
-
300
- def _layer_norm_fwd(
301
- x,
302
- weight,
303
- bias,
304
- eps,
305
- residual=None,
306
- x1=None,
307
- weight1=None,
308
- bias1=None,
309
- dropout_p=0.0,
310
- rowscale=None,
311
- out_dtype=None,
312
- residual_dtype=None,
313
- is_rms_norm=False,
314
- return_dropout_mask=False,
315
- ):
316
- if residual is not None:
317
- residual_dtype = residual.dtype
318
- M, N = x.shape
319
- assert x.stride(-1) == 1
320
- if residual is not None:
321
- assert residual.stride(-1) == 1
322
- assert residual.shape == (M, N)
323
- assert weight.shape == (N,)
324
- assert weight.stride(-1) == 1
325
- if bias is not None:
326
- assert bias.stride(-1) == 1
327
- assert bias.shape == (N,)
328
- if x1 is not None:
329
- assert x1.shape == x.shape
330
- assert rowscale is None
331
- assert x1.stride(-1) == 1
332
- if weight1 is not None:
333
- assert weight1.shape == (N,)
334
- assert weight1.stride(-1) == 1
335
- if bias1 is not None:
336
- assert bias1.shape == (N,)
337
- assert bias1.stride(-1) == 1
338
- if rowscale is not None:
339
- assert rowscale.is_contiguous()
340
- assert rowscale.shape == (M,)
341
- # allocate output
342
- y = torch.empty_like(x, dtype=x.dtype if out_dtype is None else out_dtype)
343
- assert y.stride(-1) == 1
344
- if weight1 is not None:
345
- y1 = torch.empty_like(y)
346
- assert y1.stride(-1) == 1
347
- else:
348
- y1 = None
349
- if (
350
- residual is not None
351
- or (residual_dtype is not None and residual_dtype != x.dtype)
352
- or dropout_p > 0.0
353
- or rowscale is not None
354
- or x1 is not None
355
- ):
356
- residual_out = torch.empty(
357
- M,
358
- N,
359
- device=x.device,
360
- dtype=residual_dtype if residual_dtype is not None else x.dtype,
361
- )
362
- assert residual_out.stride(-1) == 1
363
- else:
364
- residual_out = None
365
- mean = (
366
- torch.empty((M,), dtype=torch.float32, device=x.device)
367
- if not is_rms_norm
368
- else None
369
- )
370
- rstd = torch.empty((M,), dtype=torch.float32, device=x.device)
371
- if dropout_p > 0.0:
372
- seeds = torch.randint(
373
- 2**32, (M if x1 is None else 2 * M,), device=x.device, dtype=torch.int64
374
- )
375
- else:
376
- seeds = None
377
- if return_dropout_mask and dropout_p > 0.0:
378
- dropout_mask = torch.empty(
379
- M if x1 is None else 2 * M, N, device=x.device, dtype=torch.bool
380
- )
381
- else:
382
- dropout_mask = None
383
- # Less than 64KB per feature: enqueue fused kernel
384
- MAX_FUSED_SIZE = 65536 // x.element_size()
385
- BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
386
- if N > BLOCK_N:
387
- raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
388
- with torch.cuda.device(x.device.index):
389
- _layer_norm_fwd_1pass_kernel[(M,)](
390
- x,
391
- y,
392
- weight,
393
- bias,
394
- residual,
395
- x1,
396
- weight1,
397
- bias1,
398
- y1,
399
- residual_out,
400
- rowscale,
401
- seeds,
402
- dropout_mask,
403
- mean,
404
- rstd,
405
- x.stride(0),
406
- y.stride(0),
407
- residual.stride(0) if residual is not None else 0,
408
- residual_out.stride(0) if residual_out is not None else 0,
409
- x1.stride(0) if x1 is not None else 0,
410
- y1.stride(0) if y1 is not None else 0,
411
- M,
412
- N,
413
- eps,
414
- dropout_p,
415
- is_rms_norm,
416
- BLOCK_N,
417
- residual is not None,
418
- residual_out is not None,
419
- bias is not None,
420
- dropout_p > 0.0,
421
- dropout_mask is not None,
422
- rowscale is not None,
423
- )
424
- # residual_out is None if residual is None and residual_dtype == input_dtype and dropout_p == 0.0
425
- if dropout_mask is not None and x1 is not None:
426
- dropout_mask, dropout_mask1 = dropout_mask.tensor_split(2, dim=0)
427
- else:
428
- dropout_mask1 = None
429
- return (
430
- y,
431
- y1,
432
- mean,
433
- rstd,
434
- residual_out if residual_out is not None else x,
435
- seeds,
436
- dropout_mask,
437
- dropout_mask1,
438
- )
439
-
440
-
441
- @triton.autotune(
442
- configs=pruned_configs_autotune,
443
- key=[
444
- "N",
445
- "HAS_DRESIDUAL",
446
- "STORE_DRESIDUAL",
447
- "IS_RMS_NORM",
448
- "HAS_BIAS",
449
- "HAS_DROPOUT",
450
- ],
451
- )
452
- # @triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None})
453
- # @triton.heuristics({"HAS_DRESIDUAL": lambda args: args["DRESIDUAL"] is not None})
454
- # @triton.heuristics({"STORE_DRESIDUAL": lambda args: args["DRESIDUAL_IN"] is not None})
455
- @triton.heuristics({"HAS_ROWSCALE": lambda args: args["ROWSCALE"] is not None})
456
- @triton.heuristics({"HAS_DY1": lambda args: args["DY1"] is not None})
457
- @triton.heuristics({"HAS_DX1": lambda args: args["DX1"] is not None})
458
- @triton.heuristics({"HAS_B1": lambda args: args["DB1"] is not None})
459
- @triton.heuristics({"RECOMPUTE_OUTPUT": lambda args: args["Y"] is not None})
460
- @triton.jit
461
- def _layer_norm_bwd_kernel(
462
- X, # pointer to the input
463
- W, # pointer to the weights
464
- B, # pointer to the biases
465
- Y, # pointer to the output to be recomputed
466
- DY, # pointer to the output gradient
467
- DX, # pointer to the input gradient
468
- DW, # pointer to the partial sum of weights gradient
469
- DB, # pointer to the partial sum of biases gradient
470
- DRESIDUAL,
471
- W1,
472
- DY1,
473
- DX1,
474
- DW1,
475
- DB1,
476
- DRESIDUAL_IN,
477
- ROWSCALE,
478
- SEEDS,
479
- Mean, # pointer to the mean
480
- Rstd, # pointer to the 1/std
481
- stride_x_row, # how much to increase the pointer when moving by 1 row
482
- stride_y_row,
483
- stride_dy_row,
484
- stride_dx_row,
485
- stride_dres_row,
486
- stride_dy1_row,
487
- stride_dx1_row,
488
- stride_dres_in_row,
489
- M, # number of rows in X
490
- N, # number of columns in X
491
- eps, # epsilon to avoid division by zero
492
- dropout_p,
493
- rows_per_program,
494
- IS_RMS_NORM: tl.constexpr,
495
- BLOCK_N: tl.constexpr,
496
- HAS_DRESIDUAL: tl.constexpr,
497
- STORE_DRESIDUAL: tl.constexpr,
498
- HAS_BIAS: tl.constexpr,
499
- HAS_DROPOUT: tl.constexpr,
500
- HAS_ROWSCALE: tl.constexpr,
501
- HAS_DY1: tl.constexpr,
502
- HAS_DX1: tl.constexpr,
503
- HAS_B1: tl.constexpr,
504
- RECOMPUTE_OUTPUT: tl.constexpr,
505
- ):
506
- # Map the program id to the elements of X, DX, and DY it should compute.
507
- row_block_id = tl.program_id(0)
508
- row_start = row_block_id * rows_per_program
509
- # Do not early exit if row_start >= M, because we need to write DW and DB
510
- cols = tl.arange(0, BLOCK_N)
511
- mask = cols < N
512
- X += row_start * stride_x_row
513
- if HAS_DRESIDUAL:
514
- DRESIDUAL += row_start * stride_dres_row
515
- if STORE_DRESIDUAL:
516
- DRESIDUAL_IN += row_start * stride_dres_in_row
517
- DY += row_start * stride_dy_row
518
- DX += row_start * stride_dx_row
519
- if HAS_DY1:
520
- DY1 += row_start * stride_dy1_row
521
- if HAS_DX1:
522
- DX1 += row_start * stride_dx1_row
523
- if RECOMPUTE_OUTPUT:
524
- Y += row_start * stride_y_row
525
- w = tl.load(W + cols, mask=mask).to(tl.float32)
526
- if RECOMPUTE_OUTPUT and HAS_BIAS:
527
- b = tl.load(B + cols, mask=mask, other=0.0).to(tl.float32)
528
- if HAS_DY1:
529
- w1 = tl.load(W1 + cols, mask=mask).to(tl.float32)
530
- dw = tl.zeros((BLOCK_N,), dtype=tl.float32)
531
- if HAS_BIAS:
532
- db = tl.zeros((BLOCK_N,), dtype=tl.float32)
533
- if HAS_DY1:
534
- dw1 = tl.zeros((BLOCK_N,), dtype=tl.float32)
535
- if HAS_B1:
536
- db1 = tl.zeros((BLOCK_N,), dtype=tl.float32)
537
- row_end = min((row_block_id + 1) * rows_per_program, M)
538
- for row in range(row_start, row_end):
539
- # Load data to SRAM
540
- x = tl.load(X + cols, mask=mask, other=0).to(tl.float32)
541
- dy = tl.load(DY + cols, mask=mask, other=0).to(tl.float32)
542
- if HAS_DY1:
543
- dy1 = tl.load(DY1 + cols, mask=mask, other=0).to(tl.float32)
544
- if not IS_RMS_NORM:
545
- mean = tl.load(Mean + row)
546
- rstd = tl.load(Rstd + row)
547
- # Compute dx
548
- xhat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd
549
- xhat = tl.where(mask, xhat, 0.0)
550
- if RECOMPUTE_OUTPUT:
551
- y = xhat * w + b if HAS_BIAS else xhat * w
552
- tl.store(Y + cols, y, mask=mask)
553
- wdy = w * dy
554
- dw += dy * xhat
555
- if HAS_BIAS:
556
- db += dy
557
- if HAS_DY1:
558
- wdy += w1 * dy1
559
- dw1 += dy1 * xhat
560
- if HAS_B1:
561
- db1 += dy1
562
- if not IS_RMS_NORM:
563
- c1 = tl.sum(xhat * wdy, axis=0) / N
564
- c2 = tl.sum(wdy, axis=0) / N
565
- dx = (wdy - (xhat * c1 + c2)) * rstd
566
- else:
567
- c1 = tl.sum(xhat * wdy, axis=0) / N
568
- dx = (wdy - xhat * c1) * rstd
569
- if HAS_DRESIDUAL:
570
- dres = tl.load(DRESIDUAL + cols, mask=mask, other=0).to(tl.float32)
571
- dx += dres
572
- # Write dx
573
- if STORE_DRESIDUAL:
574
- tl.store(DRESIDUAL_IN + cols, dx, mask=mask)
575
- if HAS_DX1:
576
- if HAS_DROPOUT:
577
- keep_mask = (
578
- tl.rand(tl.load(SEEDS + M + row).to(tl.uint32), cols, n_rounds=7)
579
- > dropout_p
580
- )
581
- dx1 = tl.where(keep_mask, dx / (1.0 - dropout_p), 0.0)
582
- else:
583
- dx1 = dx
584
- tl.store(DX1 + cols, dx1, mask=mask)
585
- if HAS_DROPOUT:
586
- keep_mask = (
587
- tl.rand(tl.load(SEEDS + row).to(tl.uint32), cols, n_rounds=7)
588
- > dropout_p
589
- )
590
- dx = tl.where(keep_mask, dx / (1.0 - dropout_p), 0.0)
591
- if HAS_ROWSCALE:
592
- rowscale = tl.load(ROWSCALE + row).to(tl.float32)
593
- dx *= rowscale
594
- tl.store(DX + cols, dx, mask=mask)
595
-
596
- X += stride_x_row
597
- if HAS_DRESIDUAL:
598
- DRESIDUAL += stride_dres_row
599
- if STORE_DRESIDUAL:
600
- DRESIDUAL_IN += stride_dres_in_row
601
- if RECOMPUTE_OUTPUT:
602
- Y += stride_y_row
603
- DY += stride_dy_row
604
- DX += stride_dx_row
605
- if HAS_DY1:
606
- DY1 += stride_dy1_row
607
- if HAS_DX1:
608
- DX1 += stride_dx1_row
609
- tl.store(DW + row_block_id * N + cols, dw, mask=mask)
610
- if HAS_BIAS:
611
- tl.store(DB + row_block_id * N + cols, db, mask=mask)
612
- if HAS_DY1:
613
- tl.store(DW1 + row_block_id * N + cols, dw1, mask=mask)
614
- if HAS_B1:
615
- tl.store(DB1 + row_block_id * N + cols, db1, mask=mask)
616
-
617
-
618
- def _layer_norm_bwd(
619
- dy,
620
- x,
621
- weight,
622
- bias,
623
- eps,
624
- mean,
625
- rstd,
626
- dresidual=None,
627
- dy1=None,
628
- weight1=None,
629
- bias1=None,
630
- seeds=None,
631
- dropout_p=0.0,
632
- rowscale=None,
633
- has_residual=False,
634
- has_x1=False,
635
- is_rms_norm=False,
636
- x_dtype=None,
637
- recompute_output=False,
638
- ):
639
- M, N = x.shape
640
- assert x.stride(-1) == 1
641
- assert dy.stride(-1) == 1
642
- assert dy.shape == (M, N)
643
- if dresidual is not None:
644
- assert dresidual.stride(-1) == 1
645
- assert dresidual.shape == (M, N)
646
- assert weight.shape == (N,)
647
- assert weight.stride(-1) == 1
648
- if bias is not None:
649
- assert bias.stride(-1) == 1
650
- assert bias.shape == (N,)
651
- if dy1 is not None:
652
- assert weight1 is not None
653
- assert dy1.shape == dy.shape
654
- assert dy1.stride(-1) == 1
655
- if weight1 is not None:
656
- assert weight1.shape == (N,)
657
- assert weight1.stride(-1) == 1
658
- if bias1 is not None:
659
- assert bias1.shape == (N,)
660
- assert bias1.stride(-1) == 1
661
- if seeds is not None:
662
- assert seeds.is_contiguous()
663
- assert seeds.shape == (M if not has_x1 else M * 2,)
664
- if rowscale is not None:
665
- assert rowscale.is_contiguous()
666
- assert rowscale.shape == (M,)
667
- # allocate output
668
- dx = (
669
- torch.empty_like(x)
670
- if x_dtype is None
671
- else torch.empty(M, N, dtype=x_dtype, device=x.device)
672
- )
673
- dresidual_in = (
674
- torch.empty_like(x)
675
- if has_residual
676
- and (dx.dtype != x.dtype or dropout_p > 0.0 or rowscale is not None or has_x1)
677
- else None
678
- )
679
- dx1 = torch.empty_like(dx) if (has_x1 and dropout_p > 0.0) else None
680
- y = (
681
- torch.empty(M, N, dtype=dy.dtype, device=dy.device)
682
- if recompute_output
683
- else None
684
- )
685
- if recompute_output:
686
- assert (
687
- weight1 is None
688
- ), "recompute_output is not supported with parallel LayerNorm"
689
-
690
- # Less than 64KB per feature: enqueue fused kernel
691
- MAX_FUSED_SIZE = 65536 // x.element_size()
692
- BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
693
- if N > BLOCK_N:
694
- raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
695
- sm_count = torch.cuda.get_device_properties(x.device).multi_processor_count
696
- _dw = torch.empty((sm_count, N), dtype=torch.float32, device=weight.device)
697
- _db = (
698
- torch.empty((sm_count, N), dtype=torch.float32, device=bias.device)
699
- if bias is not None
700
- else None
701
- )
702
- _dw1 = torch.empty_like(_dw) if weight1 is not None else None
703
- _db1 = torch.empty_like(_db) if bias1 is not None else None
704
- rows_per_program = math.ceil(M / sm_count)
705
- grid = (sm_count,)
706
- with torch.cuda.device(x.device.index):
707
- _layer_norm_bwd_kernel[grid](
708
- x,
709
- weight,
710
- bias,
711
- y,
712
- dy,
713
- dx,
714
- _dw,
715
- _db,
716
- dresidual,
717
- weight1,
718
- dy1,
719
- dx1,
720
- _dw1,
721
- _db1,
722
- dresidual_in,
723
- rowscale,
724
- seeds,
725
- mean,
726
- rstd,
727
- x.stride(0),
728
- 0 if not recompute_output else y.stride(0),
729
- dy.stride(0),
730
- dx.stride(0),
731
- dresidual.stride(0) if dresidual is not None else 0,
732
- dy1.stride(0) if dy1 is not None else 0,
733
- dx1.stride(0) if dx1 is not None else 0,
734
- dresidual_in.stride(0) if dresidual_in is not None else 0,
735
- M,
736
- N,
737
- eps,
738
- dropout_p,
739
- rows_per_program,
740
- is_rms_norm,
741
- BLOCK_N,
742
- dresidual is not None,
743
- dresidual_in is not None,
744
- bias is not None,
745
- dropout_p > 0.0,
746
- )
747
- dw = _dw.sum(0).to(weight.dtype)
748
- db = _db.sum(0).to(bias.dtype) if bias is not None else None
749
- dw1 = _dw1.sum(0).to(weight1.dtype) if weight1 is not None else None
750
- db1 = _db1.sum(0).to(bias1.dtype) if bias1 is not None else None
751
- # Don't need to compute dresidual_in separately in this case
752
- if has_residual and dx.dtype == x.dtype and dropout_p == 0.0 and rowscale is None:
753
- dresidual_in = dx
754
- if has_x1 and dropout_p == 0.0:
755
- dx1 = dx
756
- return (
757
- (dx, dw, db, dresidual_in, dx1, dw1, db1)
758
- if not recompute_output
759
- else (dx, dw, db, dresidual_in, dx1, dw1, db1, y)
760
- )
761
-
762
-
763
- class LayerNormFn(torch.autograd.Function):
764
- @staticmethod
765
- def forward(
766
- ctx,
767
- x,
768
- weight,
769
- bias,
770
- residual=None,
771
- x1=None,
772
- weight1=None,
773
- bias1=None,
774
- eps=1e-6,
775
- dropout_p=0.0,
776
- rowscale=None,
777
- prenorm=False,
778
- residual_in_fp32=False,
779
- is_rms_norm=False,
780
- return_dropout_mask=False,
781
- ):
782
- x_shape_og = x.shape
783
- # reshape input data into 2D tensor
784
- x = x.reshape(-1, x.shape[-1])
785
- if x.stride(-1) != 1:
786
- x = x.contiguous()
787
- if residual is not None:
788
- assert residual.shape == x_shape_og
789
- residual = residual.reshape(-1, residual.shape[-1])
790
- if residual.stride(-1) != 1:
791
- residual = residual.contiguous()
792
- if x1 is not None:
793
- assert x1.shape == x_shape_og
794
- assert rowscale is None, "rowscale is not supported with parallel LayerNorm"
795
- x1 = x1.reshape(-1, x1.shape[-1])
796
- if x1.stride(-1) != 1:
797
- x1 = x1.contiguous()
798
- weight = weight.contiguous()
799
- if bias is not None:
800
- bias = bias.contiguous()
801
- if weight1 is not None:
802
- weight1 = weight1.contiguous()
803
- if bias1 is not None:
804
- bias1 = bias1.contiguous()
805
- if rowscale is not None:
806
- rowscale = rowscale.reshape(-1).contiguous()
807
- residual_dtype = (
808
- residual.dtype
809
- if residual is not None
810
- else (torch.float32 if residual_in_fp32 else None)
811
- )
812
- y, y1, mean, rstd, residual_out, seeds, dropout_mask, dropout_mask1 = (
813
- _layer_norm_fwd(
814
- x,
815
- weight,
816
- bias,
817
- eps,
818
- residual,
819
- x1,
820
- weight1,
821
- bias1,
822
- dropout_p=dropout_p,
823
- rowscale=rowscale,
824
- residual_dtype=residual_dtype,
825
- is_rms_norm=is_rms_norm,
826
- return_dropout_mask=return_dropout_mask,
827
- )
828
- )
829
- ctx.save_for_backward(
830
- residual_out, weight, bias, weight1, bias1, rowscale, seeds, mean, rstd
831
- )
832
- ctx.x_shape_og = x_shape_og
833
- ctx.eps = eps
834
- ctx.dropout_p = dropout_p
835
- ctx.is_rms_norm = is_rms_norm
836
- ctx.has_residual = residual is not None
837
- ctx.has_x1 = x1 is not None
838
- ctx.prenorm = prenorm
839
- ctx.x_dtype = x.dtype
840
- y = y.reshape(x_shape_og)
841
- y1 = y1.reshape(x_shape_og) if y1 is not None else None
842
- residual_out = (
843
- residual_out.reshape(x_shape_og) if residual_out is not None else None
844
- )
845
- dropout_mask = (
846
- dropout_mask.reshape(x_shape_og) if dropout_mask is not None else None
847
- )
848
- dropout_mask1 = (
849
- dropout_mask1.reshape(x_shape_og) if dropout_mask1 is not None else None
850
- )
851
- if not return_dropout_mask:
852
- if weight1 is None:
853
- return y if not prenorm else (y, residual_out)
854
- else:
855
- return (y, y1) if not prenorm else (y, y1, residual_out)
856
- else:
857
- if weight1 is None:
858
- return (
859
- (y, dropout_mask, dropout_mask1)
860
- if not prenorm
861
- else (y, residual_out, dropout_mask, dropout_mask1)
862
- )
863
- else:
864
- return (
865
- (y, y1, dropout_mask, dropout_mask1)
866
- if not prenorm
867
- else (y, y1, residual_out, dropout_mask, dropout_mask1)
868
- )
869
-
870
- @staticmethod
871
- def backward(ctx, dy, *args):
872
- x, weight, bias, weight1, bias1, rowscale, seeds, mean, rstd = ctx.saved_tensors
873
- dy = dy.reshape(-1, dy.shape[-1])
874
- if dy.stride(-1) != 1:
875
- dy = dy.contiguous()
876
- assert dy.shape == x.shape
877
- if weight1 is not None:
878
- dy1, args = args[0], args[1:]
879
- dy1 = dy1.reshape(-1, dy1.shape[-1])
880
- if dy1.stride(-1) != 1:
881
- dy1 = dy1.contiguous()
882
- assert dy1.shape == x.shape
883
- else:
884
- dy1 = None
885
- if ctx.prenorm:
886
- dresidual = args[0]
887
- dresidual = dresidual.reshape(-1, dresidual.shape[-1])
888
- if dresidual.stride(-1) != 1:
889
- dresidual = dresidual.contiguous()
890
- assert dresidual.shape == x.shape
891
- else:
892
- dresidual = None
893
- dx, dw, db, dresidual_in, dx1, dw1, db1 = _layer_norm_bwd(
894
- dy,
895
- x,
896
- weight,
897
- bias,
898
- ctx.eps,
899
- mean,
900
- rstd,
901
- dresidual,
902
- dy1,
903
- weight1,
904
- bias1,
905
- seeds,
906
- ctx.dropout_p,
907
- rowscale,
908
- ctx.has_residual,
909
- ctx.has_x1,
910
- ctx.is_rms_norm,
911
- x_dtype=ctx.x_dtype,
912
- )
913
- return (
914
- dx.reshape(ctx.x_shape_og),
915
- dw,
916
- db,
917
- dresidual_in.reshape(ctx.x_shape_og) if ctx.has_residual else None,
918
- dx1.reshape(ctx.x_shape_og) if dx1 is not None else None,
919
- dw1,
920
- db1,
921
- None,
922
- None,
923
- None,
924
- None,
925
- None,
926
- None,
927
- None,
928
- )
929
-
930
-
931
- def layer_norm_fn(
932
- x,
933
- weight,
934
- bias,
935
- residual=None,
936
- x1=None,
937
- weight1=None,
938
- bias1=None,
939
- eps=1e-6,
940
- dropout_p=0.0,
941
- rowscale=None,
942
- prenorm=False,
943
- residual_in_fp32=False,
944
- is_rms_norm=False,
945
- return_dropout_mask=False,
946
- ):
947
- return LayerNormFn.apply(
948
- x,
949
- weight,
950
- bias,
951
- residual,
952
- x1,
953
- weight1,
954
- bias1,
955
- eps,
956
- dropout_p,
957
- rowscale,
958
- prenorm,
959
- residual_in_fp32,
960
- is_rms_norm,
961
- return_dropout_mask,
962
- )
963
-
964
-
965
- def rms_norm_fn(
966
- x,
967
- weight,
968
- bias,
969
- residual=None,
970
- x1=None,
971
- weight1=None,
972
- bias1=None,
973
- eps=1e-6,
974
- dropout_p=0.0,
975
- rowscale=None,
976
- prenorm=False,
977
- residual_in_fp32=False,
978
- return_dropout_mask=False,
979
- ):
980
- return LayerNormFn.apply(
981
- x,
982
- weight,
983
- bias,
984
- residual,
985
- x1,
986
- weight1,
987
- bias1,
988
- eps,
989
- dropout_p,
990
- rowscale,
991
- prenorm,
992
- residual_in_fp32,
993
- True,
994
- return_dropout_mask,
995
- )
996
-
997
-
998
- class RMSNorm(torch.nn.Module):
999
-
1000
- def __init__(self, hidden_size, eps=1e-5, dropout_p=0.0, device=None, dtype=None):
1001
- factory_kwargs = {"device": device, "dtype": dtype}
1002
- super().__init__()
1003
- self.eps = eps
1004
- if dropout_p > 0.0:
1005
- self.drop = torch.nn.Dropout(dropout_p)
1006
- else:
1007
- self.drop = None
1008
- self.weight = torch.nn.Parameter(torch.empty(hidden_size, **factory_kwargs))
1009
- self.register_parameter("bias", None)
1010
- self.reset_parameters()
1011
-
1012
- def reset_parameters(self):
1013
- torch.nn.init.ones_(self.weight)
1014
-
1015
- def forward(self, x, residual=None, prenorm=False, residual_in_fp32=False):
1016
- return rms_norm_fn(
1017
- x,
1018
- self.weight,
1019
- self.bias,
1020
- residual=residual,
1021
- eps=self.eps,
1022
- dropout_p=self.drop.p if self.drop is not None and self.training else 0.0,
1023
- prenorm=prenorm,
1024
- residual_in_fp32=residual_in_fp32,
1025
- )
1026
-
1027
-
1028
- class LayerNormLinearFn(torch.autograd.Function):
1029
- @staticmethod
1030
- @custom_fwd
1031
- def forward(
1032
- ctx,
1033
- x,
1034
- norm_weight,
1035
- norm_bias,
1036
- linear_weight,
1037
- linear_bias,
1038
- residual=None,
1039
- eps=1e-6,
1040
- prenorm=False,
1041
- residual_in_fp32=False,
1042
- is_rms_norm=False,
1043
- ):
1044
- x_shape_og = x.shape
1045
- # reshape input data into 2D tensor
1046
- x = x.reshape(-1, x.shape[-1])
1047
- if x.stride(-1) != 1:
1048
- x = x.contiguous()
1049
- if residual is not None:
1050
- assert residual.shape == x_shape_og
1051
- residual = residual.reshape(-1, residual.shape[-1])
1052
- if residual.stride(-1) != 1:
1053
- residual = residual.contiguous()
1054
- norm_weight = norm_weight.contiguous()
1055
- if norm_bias is not None:
1056
- norm_bias = norm_bias.contiguous()
1057
- residual_dtype = (
1058
- residual.dtype
1059
- if residual is not None
1060
- else (torch.float32 if residual_in_fp32 else None)
1061
- )
1062
- y, _, mean, rstd, residual_out, *rest = _layer_norm_fwd(
1063
- x,
1064
- norm_weight,
1065
- norm_bias,
1066
- eps,
1067
- residual,
1068
- out_dtype=(
1069
- None
1070
- if not torch.is_autocast_enabled()
1071
- else torch.get_autocast_gpu_dtype()
1072
- ),
1073
- residual_dtype=residual_dtype,
1074
- is_rms_norm=is_rms_norm,
1075
- )
1076
- y = y.reshape(x_shape_og)
1077
- dtype = (
1078
- torch.get_autocast_gpu_dtype() if torch.is_autocast_enabled() else y.dtype
1079
- )
1080
- linear_weight = linear_weight.to(dtype)
1081
- linear_bias = linear_bias.to(dtype) if linear_bias is not None else None
1082
- out = F.linear(y.to(linear_weight.dtype), linear_weight, linear_bias)
1083
- # We don't store y, will be recomputed in the backward pass to save memory
1084
- ctx.save_for_backward(
1085
- residual_out, norm_weight, norm_bias, linear_weight, mean, rstd
1086
- )
1087
- ctx.x_shape_og = x_shape_og
1088
- ctx.eps = eps
1089
- ctx.is_rms_norm = is_rms_norm
1090
- ctx.has_residual = residual is not None
1091
- ctx.prenorm = prenorm
1092
- ctx.x_dtype = x.dtype
1093
- ctx.linear_bias_is_none = linear_bias is None
1094
- return out if not prenorm else (out, residual_out.reshape(x_shape_og))
1095
-
1096
- @staticmethod
1097
- @custom_bwd
1098
- def backward(ctx, dout, *args):
1099
- x, norm_weight, norm_bias, linear_weight, mean, rstd = ctx.saved_tensors
1100
- dout = dout.reshape(-1, dout.shape[-1])
1101
- dy = F.linear(dout, linear_weight.t())
1102
- dlinear_bias = None if ctx.linear_bias_is_none else dout.sum(0)
1103
- if dy.stride(-1) != 1:
1104
- dy = dy.contiguous()
1105
- assert dy.shape == x.shape
1106
- if ctx.prenorm:
1107
- dresidual = args[0]
1108
- dresidual = dresidual.reshape(-1, dresidual.shape[-1])
1109
- if dresidual.stride(-1) != 1:
1110
- dresidual = dresidual.contiguous()
1111
- assert dresidual.shape == x.shape
1112
- else:
1113
- dresidual = None
1114
- dx, dnorm_weight, dnorm_bias, dresidual_in, _, _, _, y = _layer_norm_bwd(
1115
- dy,
1116
- x,
1117
- norm_weight,
1118
- norm_bias,
1119
- ctx.eps,
1120
- mean,
1121
- rstd,
1122
- dresidual=dresidual,
1123
- has_residual=ctx.has_residual,
1124
- is_rms_norm=ctx.is_rms_norm,
1125
- x_dtype=ctx.x_dtype,
1126
- recompute_output=True,
1127
- )
1128
- dlinear_weight = torch.einsum("bo,bi->oi", dout, y)
1129
- return (
1130
- dx.reshape(ctx.x_shape_og),
1131
- dnorm_weight,
1132
- dnorm_bias,
1133
- dlinear_weight,
1134
- dlinear_bias,
1135
- dresidual_in.reshape(ctx.x_shape_og) if ctx.has_residual else None,
1136
- None,
1137
- None,
1138
- None,
1139
- None,
1140
- )
1141
-
1142
-
1143
- def layer_norm_linear_fn(
1144
- x,
1145
- norm_weight,
1146
- norm_bias,
1147
- linear_weight,
1148
- linear_bias,
1149
- residual=None,
1150
- eps=1e-6,
1151
- prenorm=False,
1152
- residual_in_fp32=False,
1153
- is_rms_norm=False,
1154
- ):
1155
- return LayerNormLinearFn.apply(
1156
- x,
1157
- norm_weight,
1158
- norm_bias,
1159
- linear_weight,
1160
- linear_bias,
1161
- residual,
1162
- eps,
1163
- prenorm,
1164
- residual_in_fp32,
1165
- is_rms_norm,
1166
-    )

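For reference, below is a minimal usage sketch of the fused RMSNorm interface exposed by the deleted layer_norm.py above (the RMSNorm module and the rms_norm_fn functional form). This is only an illustrative sketch, not part of the commit: the tensor shapes are made up, and it assumes a CUDA device plus an installed mamba_ssm build that still ships mamba_ssm.ops.triton.layer_norm.

# Illustrative sketch only (not part of this commit). Assumes a CUDA device and
# that mamba_ssm.ops.triton.layer_norm is importable from the installed package.
import torch
from mamba_ssm.ops.triton.layer_norm import RMSNorm, rms_norm_fn

batch, seqlen, dim = 2, 128, 768  # made-up shapes
x = torch.randn(batch, seqlen, dim, device="cuda", dtype=torch.float16)
residual = torch.randn_like(x)

norm = RMSNorm(dim, eps=1e-5, device="cuda", dtype=torch.float16)

# Pre-norm usage: returns the normalized output and the updated residual
# stream (x + residual), matching the prenorm branch of LayerNormFn above.
y, new_residual = norm(x, residual=residual, prenorm=True)

# Functional form of the same fused kernel; bias is None for RMSNorm.
y2 = rms_norm_fn(x, norm.weight, norm.bias, eps=1e-5)
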
build/torch25-cxx11-cu118-x86_64-linux/mamba_ssm/ops/triton/selective_state_update.py DELETED
@@ -1,389 +0,0 @@
1
- # Copyright (c) 2024, Tri Dao, Albert Gu.
2
-
3
- """We want triton==2.1.0 or triton==2.2.0 or triton==2.3.0 for this
4
- """
5
-
6
- import math
7
- import torch
8
- import torch.nn.functional as F
9
-
10
- import triton
11
- import triton.language as tl
12
-
13
- from einops import rearrange, repeat
14
-
15
- from .softplus import softplus
16
-
17
-
18
- @triton.heuristics({"HAS_DT_BIAS": lambda args: args["dt_bias_ptr"] is not None})
19
- @triton.heuristics({"HAS_D": lambda args: args["D_ptr"] is not None})
20
- @triton.heuristics({"HAS_Z": lambda args: args["z_ptr"] is not None})
21
- @triton.heuristics(
22
- {
23
- "HAS_STATE_BATCH_INDICES": lambda args: args["state_batch_indices_ptr"]
24
- is not None
25
- }
26
- )
27
- @triton.heuristics(
28
- {"BLOCK_SIZE_DSTATE": lambda args: triton.next_power_of_2(args["dstate"])}
29
- )
30
- @triton.jit
31
- def _selective_scan_update_kernel(
32
- # Pointers to matrices
33
- state_ptr,
34
- x_ptr,
35
- dt_ptr,
36
- dt_bias_ptr,
37
- A_ptr,
38
- B_ptr,
39
- C_ptr,
40
- D_ptr,
41
- z_ptr,
42
- out_ptr,
43
- state_batch_indices_ptr,
44
- # Matrix dimensions
45
- batch,
46
- nheads,
47
- dim,
48
- dstate,
49
- nheads_ngroups_ratio,
50
- # Strides
51
- stride_state_batch,
52
- stride_state_head,
53
- stride_state_dim,
54
- stride_state_dstate,
55
- stride_x_batch,
56
- stride_x_head,
57
- stride_x_dim,
58
- stride_dt_batch,
59
- stride_dt_head,
60
- stride_dt_dim,
61
- stride_dt_bias_head,
62
- stride_dt_bias_dim,
63
- stride_A_head,
64
- stride_A_dim,
65
- stride_A_dstate,
66
- stride_B_batch,
67
- stride_B_group,
68
- stride_B_dstate,
69
- stride_C_batch,
70
- stride_C_group,
71
- stride_C_dstate,
72
- stride_D_head,
73
- stride_D_dim,
74
- stride_z_batch,
75
- stride_z_head,
76
- stride_z_dim,
77
- stride_out_batch,
78
- stride_out_head,
79
- stride_out_dim,
80
- # Meta-parameters
81
- DT_SOFTPLUS: tl.constexpr,
82
- TIE_HDIM: tl.constexpr,
83
- BLOCK_SIZE_M: tl.constexpr,
84
- HAS_DT_BIAS: tl.constexpr,
85
- HAS_D: tl.constexpr,
86
- HAS_Z: tl.constexpr,
87
- HAS_STATE_BATCH_INDICES: tl.constexpr,
88
- BLOCK_SIZE_DSTATE: tl.constexpr,
89
- ):
90
- pid_m = tl.program_id(axis=0)
91
- pid_b = tl.program_id(axis=1)
92
- pid_h = tl.program_id(axis=2)
93
-
94
- if HAS_STATE_BATCH_INDICES:
95
- state_batch_indices_ptr += pid_b
96
- state_batch_idx = tl.load(state_batch_indices_ptr)
97
- state_ptr += state_batch_idx * stride_state_batch + pid_h * stride_state_head
98
- else:
99
- state_ptr += pid_b * stride_state_batch + pid_h * stride_state_head
100
-
101
- x_ptr += pid_b * stride_x_batch + pid_h * stride_x_head
102
- dt_ptr += pid_b * stride_dt_batch + pid_h * stride_dt_head
103
- if HAS_DT_BIAS:
104
- dt_bias_ptr += pid_h * stride_dt_bias_head
105
- A_ptr += pid_h * stride_A_head
106
- B_ptr += pid_b * stride_B_batch + (pid_h // nheads_ngroups_ratio) * stride_B_group
107
- C_ptr += pid_b * stride_C_batch + (pid_h // nheads_ngroups_ratio) * stride_C_group
108
- if HAS_Z:
109
- z_ptr += pid_b * stride_z_batch + pid_h * stride_z_head
110
- out_ptr += pid_b * stride_out_batch + pid_h * stride_out_head
111
-
112
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
113
- offs_n = tl.arange(0, BLOCK_SIZE_DSTATE)
114
- state_ptrs = state_ptr + (
115
- offs_m[:, None] * stride_state_dim + offs_n[None, :] * stride_state_dstate
116
- )
117
- x_ptrs = x_ptr + offs_m * stride_x_dim
118
- dt_ptrs = dt_ptr + offs_m * stride_dt_dim
119
- if HAS_DT_BIAS:
120
- dt_bias_ptrs = dt_bias_ptr + offs_m * stride_dt_bias_dim
121
- if HAS_D:
122
- D_ptr += pid_h * stride_D_head
123
- A_ptrs = A_ptr + (
124
- offs_m[:, None] * stride_A_dim + offs_n[None, :] * stride_A_dstate
125
- )
126
- B_ptrs = B_ptr + offs_n * stride_B_dstate
127
- C_ptrs = C_ptr + offs_n * stride_C_dstate
128
- if HAS_D:
129
- D_ptrs = D_ptr + offs_m * stride_D_dim
130
- if HAS_Z:
131
- z_ptrs = z_ptr + offs_m * stride_z_dim
132
- out_ptrs = out_ptr + offs_m * stride_out_dim
133
-
134
- state = tl.load(
135
- state_ptrs, mask=(offs_m[:, None] < dim) & (offs_n[None, :] < dstate), other=0.0
136
- )
137
- x = tl.load(x_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
138
- if not TIE_HDIM:
139
- dt = tl.load(dt_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
140
- if HAS_DT_BIAS:
141
- dt += tl.load(dt_bias_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
142
- if DT_SOFTPLUS:
143
- dt = tl.where(dt <= 20.0, softplus(dt), dt)
144
- A = tl.load(
145
- A_ptrs, mask=(offs_m[:, None] < dim) & (offs_n[None, :] < dstate), other=0.0
146
- ).to(tl.float32)
147
- dA = tl.exp(A * dt[:, None])
148
- else:
149
- dt = tl.load(dt_ptr).to(tl.float32)
150
- if HAS_DT_BIAS:
151
- dt += tl.load(dt_bias_ptr).to(tl.float32)
152
- if DT_SOFTPLUS:
153
- dt = tl.where(dt <= 20.0, softplus(dt), dt)
154
- A = tl.load(A_ptr).to(tl.float32)
155
- dA = tl.exp(A * dt) # scalar, not a matrix
156
-
157
- B = tl.load(B_ptrs, mask=offs_n < dstate, other=0.0).to(tl.float32)
158
- C = tl.load(C_ptrs, mask=offs_n < dstate, other=0.0).to(tl.float32)
159
- if HAS_D:
160
- D = tl.load(D_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
161
- if HAS_Z:
162
- z = tl.load(z_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
163
-
164
- if not TIE_HDIM:
165
- dB = B[None, :] * dt[:, None]
166
- else:
167
- dB = B * dt # vector of size (dstate,)
168
- state = state * dA + dB * x[:, None]
169
- tl.store(
170
- state_ptrs, state, mask=(offs_m[:, None] < dim) & (offs_n[None, :] < dstate)
171
- )
172
- out = tl.sum(state * C[None, :], axis=1)
173
- if HAS_D:
174
- out += x * D
175
- if HAS_Z:
176
- out *= z * tl.sigmoid(z)
177
- tl.store(out_ptrs, out, mask=offs_m < dim)
178
-
179
-
180
- def selective_state_update(
181
- state,
182
- x,
183
- dt,
184
- A,
185
- B,
186
- C,
187
- D=None,
188
- z=None,
189
- dt_bias=None,
190
- dt_softplus=False,
191
- state_batch_indices=None,
192
- ):
193
- """
194
- Argument:
195
- state: (batch, dim, dstate) or (batch, nheads, dim, dstate)
196
- x: (batch, dim) or (batch, nheads, dim)
197
- dt: (batch, dim) or (batch, nheads, dim)
198
- A: (dim, dstate) or (nheads, dim, dstate)
199
- B: (batch, dstate) or (batch, ngroups, dstate)
200
- C: (batch, dstate) or (batch, ngroups, dstate)
201
- D: (dim,) or (nheads, dim)
202
- z: (batch, dim) or (batch, nheads, dim)
203
- dt_bias: (dim,) or (nheads, dim)
204
- Return:
205
- out: (batch, dim) or (batch, nheads, dim)
206
- """
207
- has_heads = state.dim() > 3
208
- if state.dim() == 3:
209
- state = state.unsqueeze(1)
210
- if x.dim() == 2:
211
- x = x.unsqueeze(1)
212
- if dt.dim() == 2:
213
- dt = dt.unsqueeze(1)
214
- if A.dim() == 2:
215
- A = A.unsqueeze(0)
216
- if B.dim() == 2:
217
- B = B.unsqueeze(1)
218
- if C.dim() == 2:
219
- C = C.unsqueeze(1)
220
- if D is not None and D.dim() == 1:
221
- D = D.unsqueeze(0)
222
- if z is not None and z.dim() == 2:
223
- z = z.unsqueeze(1)
224
- if dt_bias is not None and dt_bias.dim() == 1:
225
- dt_bias = dt_bias.unsqueeze(0)
226
- _, nheads, dim, dstate = state.shape
227
- batch = x.shape[0]
228
- if x.shape != (batch, nheads, dim):
229
- print(f"{state.shape} {x.shape} {batch} {nheads} {dim}")
230
- assert x.shape == (batch, nheads, dim)
231
- assert dt.shape == x.shape
232
- assert A.shape == (nheads, dim, dstate)
233
- ngroups = B.shape[1]
234
- assert nheads % ngroups == 0, "nheads must be divisible by ngroups"
235
- assert B.shape == (batch, ngroups, dstate)
236
- assert C.shape == B.shape
237
- if D is not None:
238
- assert D.shape == (nheads, dim)
239
- if z is not None:
240
- assert z.shape == x.shape
241
- if dt_bias is not None:
242
- assert dt_bias.shape == (nheads, dim)
243
- if state_batch_indices is not None:
244
- assert state_batch_indices.shape == (batch,)
245
- out = torch.empty_like(x)
246
- grid = lambda META: (triton.cdiv(dim, META["BLOCK_SIZE_M"]), batch, nheads)
247
- z_strides = (z.stride(0), z.stride(1), z.stride(2)) if z is not None else (0, 0, 0)
248
- # We don't want autotune since it will overwrite the state
249
- # We instead tune by hand.
250
- BLOCK_SIZE_M, num_warps = (
251
- (32, 4)
252
- if dstate <= 16
253
- else (
254
- (16, 4)
255
- if dstate <= 32
256
- else ((8, 4) if dstate <= 64 else ((4, 4) if dstate <= 128 else ((4, 8))))
257
- )
258
- )
259
- tie_hdim = (
260
- A.stride(-1) == 0
261
- and A.stride(-2) == 0
262
- and dt.stride(-1) == 0
263
- and dt_bias.stride(-1) == 0
264
- )
265
- with torch.cuda.device(x.device.index):
266
- _selective_scan_update_kernel[grid](
267
- state,
268
- x,
269
- dt,
270
- dt_bias,
271
- A,
272
- B,
273
- C,
274
- D,
275
- z,
276
- out,
277
- state_batch_indices,
278
- batch,
279
- nheads,
280
- dim,
281
- dstate,
282
- nheads // ngroups,
283
- state.stride(0),
284
- state.stride(1),
285
- state.stride(2),
286
- state.stride(3),
287
- x.stride(0),
288
- x.stride(1),
289
- x.stride(2),
290
- dt.stride(0),
291
- dt.stride(1),
292
- dt.stride(2),
293
- *(dt_bias.stride(0), dt_bias.stride(1)) if dt_bias is not None else 0,
294
- A.stride(0),
295
- A.stride(1),
296
- A.stride(2),
297
- B.stride(0),
298
- B.stride(1),
299
- B.stride(2),
300
- C.stride(0),
301
- C.stride(1),
302
- C.stride(2),
303
- *(D.stride(0), D.stride(1)) if D is not None else 0,
304
- z_strides[0],
305
- z_strides[1],
306
- z_strides[2],
307
- out.stride(0),
308
- out.stride(1),
309
- out.stride(2),
310
- dt_softplus,
311
- tie_hdim,
312
- BLOCK_SIZE_M,
313
- num_warps=num_warps,
314
- )
315
- if not has_heads:
316
- out = out.squeeze(1)
317
- return out
318
-
319
-
320
- def selective_state_update_ref(
321
- state, x, dt, A, B, C, D=None, z=None, dt_bias=None, dt_softplus=False
322
- ):
323
- """
324
- Argument:
325
- state: (batch, dim, dstate) or (batch, nheads, dim, dstate)
326
- x: (batch, dim) or (batch, nheads, dim)
327
- dt: (batch, dim) or (batch, nheads, dim)
328
- A: (dim, dstate) or (nheads, dim, dstate)
329
- B: (batch, dstate) or (batch, ngroups, dstate)
330
- C: (batch, dstate) or (batch, ngroups, dstate)
331
- D: (dim,) or (nheads, dim)
332
- z: (batch, dim) or (batch, nheads, dim)
333
- dt_bias: (dim,) or (nheads, dim)
334
- Return:
335
- out: (batch, dim) or (batch, nheads, dim)
336
- """
337
- has_heads = state.dim() > 3
338
- if state.dim() == 3:
339
- state = state.unsqueeze(1)
340
- if x.dim() == 2:
341
- x = x.unsqueeze(1)
342
- if dt.dim() == 2:
343
- dt = dt.unsqueeze(1)
344
- if A.dim() == 2:
345
- A = A.unsqueeze(0)
346
- if B.dim() == 2:
347
- B = B.unsqueeze(1)
348
- if C.dim() == 2:
349
- C = C.unsqueeze(1)
350
- if D is not None and D.dim() == 1:
351
- D = D.unsqueeze(0)
352
- if z is not None and z.dim() == 2:
353
- z = z.unsqueeze(1)
354
- if dt_bias is not None and dt_bias.dim() == 1:
355
- dt_bias = dt_bias.unsqueeze(0)
356
- batch, nheads, dim, dstate = state.shape
357
- assert x.shape == (batch, nheads, dim)
358
- assert dt.shape == x.shape
359
- assert A.shape == (nheads, dim, dstate)
360
- ngroups = B.shape[1]
361
- assert nheads % ngroups == 0, "nheads must be divisible by ngroups"
362
- assert B.shape == (batch, ngroups, dstate)
363
- assert C.shape == B.shape
364
- if D is not None:
365
- assert D.shape == (nheads, dim)
366
- if z is not None:
367
- assert z.shape == x.shape
368
- if dt_bias is not None:
369
- assert dt_bias.shape == (nheads, dim)
370
- dt = dt + dt_bias
371
- dt = F.softplus(dt) if dt_softplus else dt
372
- dA = torch.exp(
373
- rearrange(dt, "b h d -> b h d 1") * A
374
- ) # (batch, nheads, dim, dstate)
375
- B = repeat(B, "b g n -> b (g h) n", h=nheads // ngroups) # (batch, nheads, dstate)
376
- C = repeat(C, "b g n -> b (g h) n", h=nheads // ngroups) # (batch, nheads, dstate)
377
- dB = rearrange(dt, "b h d -> b h d 1") * rearrange(
378
- B, "b h n -> b h 1 n"
379
- ) # (batch, nheads, dim, dstate)
380
- state.copy_(
381
- state * dA + dB * rearrange(x, "b h d -> b h d 1")
382
-     )  # (batch, nheads, dim, dstate)
383
- out = torch.einsum("bhdn,bhn->bhd", state.to(C.dtype), C)
384
- if D is not None:
385
- out += (x * D).to(out.dtype)
386
- out = (out if z is None else out * F.silu(z)).to(x.dtype)
387
- if not has_heads:
388
- out = out.squeeze(1)
389
-        return out

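For reference, below is a minimal single-token decode sketch of the selective_state_update kernel deleted above, compared against its pure-PyTorch reference selective_state_update_ref. This is only an illustrative sketch, not part of the commit: the shapes are made up, a CUDA device is assumed, and it assumes an installed mamba_ssm build that still ships mamba_ssm.ops.triton.selective_state_update.

# Illustrative sketch only (not part of this commit). Assumes a CUDA device and
# that mamba_ssm.ops.triton.selective_state_update is importable.
import torch
from mamba_ssm.ops.triton.selective_state_update import (
    selective_state_update,
    selective_state_update_ref,
)

batch, nheads, dim, dstate, ngroups = 2, 4, 64, 16, 1  # made-up shapes
device, dtype = "cuda", torch.float32

state = torch.randn(batch, nheads, dim, dstate, device=device, dtype=dtype)
x = torch.randn(batch, nheads, dim, device=device, dtype=dtype)
dt = torch.rand(batch, nheads, dim, device=device, dtype=dtype)
A = -torch.rand(nheads, dim, dstate, device=device, dtype=dtype)
B = torch.randn(batch, ngroups, dstate, device=device, dtype=dtype)
C = torch.randn(batch, ngroups, dstate, device=device, dtype=dtype)
D = torch.randn(nheads, dim, device=device, dtype=dtype)
dt_bias = torch.rand(nheads, dim, device=device, dtype=dtype)

state_ref = state.clone()
# The Triton kernel updates `state` in place and returns the token output.
out = selective_state_update(state, x, dt, A, B, C, D=D, dt_bias=dt_bias, dt_softplus=True)
out_ref = selective_state_update_ref(state_ref, x, dt, A, B, C, D=D, dt_bias=dt_bias, dt_softplus=True)
print((out - out_ref).abs().max(), (state - state_ref).abs().max())
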
build/torch25-cxx11-cu118-x86_64-linux/mamba_ssm/ops/triton/ssd_chunk_scan.py DELETED
The diff for this file is too large to render. See raw diff
 
build/torch25-cxx11-cu118-x86_64-linux/mamba_ssm/ops/triton/ssd_chunk_state.py DELETED
@@ -1,2012 +0,0 @@
1
- # Copyright (c) 2024, Tri Dao, Albert Gu.
2
-
3
- """We want triton==2.1.0 or 2.2.0 for this
4
- """
5
-
6
- import math
7
- import torch
8
- import torch.nn.functional as F
9
-
10
- import triton
11
- import triton.language as tl
12
-
13
- from einops import rearrange, repeat
14
-
15
- from .softplus import softplus
16
-
17
-
18
- def init_to_zero(names):
19
- return lambda nargs: [
20
- nargs[name].zero_() for name in names if nargs[name] is not None
21
- ]
22
-
23
-
24
- @triton.autotune(
25
- configs=[
26
- triton.Config({"BLOCK_SIZE_H": 1}),
27
- triton.Config({"BLOCK_SIZE_H": 2}),
28
- triton.Config({"BLOCK_SIZE_H": 4}),
29
- triton.Config({"BLOCK_SIZE_H": 8}),
30
- triton.Config({"BLOCK_SIZE_H": 16}),
31
- triton.Config({"BLOCK_SIZE_H": 32}),
32
- triton.Config({"BLOCK_SIZE_H": 64}),
33
- ],
34
- key=["chunk_size", "nheads"],
35
- )
36
- @triton.jit
37
- def _chunk_cumsum_fwd_kernel(
38
- # Pointers to matrices
39
- dt_ptr,
40
- A_ptr,
41
- dt_bias_ptr,
42
- dt_out_ptr,
43
- dA_cumsum_ptr,
44
- # Matrix dimension
45
- batch,
46
- seqlen,
47
- nheads,
48
- chunk_size,
49
- dt_min,
50
- dt_max,
51
- # Strides
52
- stride_dt_batch,
53
- stride_dt_seqlen,
54
- stride_dt_head,
55
- stride_A_head,
56
- stride_dt_bias_head,
57
- stride_dt_out_batch,
58
- stride_dt_out_chunk,
59
- stride_dt_out_head,
60
- stride_dt_out_csize,
61
- stride_dA_cs_batch,
62
- stride_dA_cs_chunk,
63
- stride_dA_cs_head,
64
- stride_dA_cs_csize,
65
- # Meta-parameters
66
- DT_SOFTPLUS: tl.constexpr,
67
- HAS_DT_BIAS: tl.constexpr,
68
- BLOCK_SIZE_H: tl.constexpr,
69
- BLOCK_SIZE_CHUNK: tl.constexpr,
70
- ):
71
- pid_b = tl.program_id(axis=0)
72
- pid_c = tl.program_id(axis=1)
73
- pid_h = tl.program_id(axis=2)
74
- dt_ptr += pid_b * stride_dt_batch + pid_c * chunk_size * stride_dt_seqlen
75
- dt_out_ptr += pid_b * stride_dt_out_batch + pid_c * stride_dt_out_chunk
76
- dA_cumsum_ptr += pid_b * stride_dA_cs_batch + pid_c * stride_dA_cs_chunk
77
-
78
- offs_h = pid_h * BLOCK_SIZE_H + tl.arange(0, BLOCK_SIZE_H)
79
- offs_c = tl.arange(0, BLOCK_SIZE_CHUNK)
80
- dt_ptrs = dt_ptr + (
81
- offs_h[:, None] * stride_dt_head + offs_c[None, :] * stride_dt_seqlen
82
- )
83
- A_ptrs = A_ptr + offs_h * stride_A_head
84
- dt_out_ptrs = dt_out_ptr + (
85
- offs_h[:, None] * stride_dt_out_head + offs_c[None, :] * stride_dt_out_csize
86
- )
87
- dA_cs_ptrs = dA_cumsum_ptr + (
88
- offs_h[:, None] * stride_dA_cs_head + offs_c[None, :] * stride_dA_cs_csize
89
- )
90
- chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
91
-
92
- dt = tl.load(
93
- dt_ptrs,
94
- mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit),
95
- other=0.0,
96
- ).to(tl.float32)
97
- if HAS_DT_BIAS:
98
- dt_bias = tl.load(
99
- dt_bias_ptr + offs_h * stride_dt_bias_head, mask=offs_h < nheads, other=0.0
100
- ).to(tl.float32)
101
- dt += dt_bias[:, None]
102
- if DT_SOFTPLUS:
103
- dt = tl.where(dt <= 20.0, softplus(dt), dt)
104
- # As of Triton 2.2.0, tl.clamp is not available yet
105
- # dt = tl.clamp(dt, dt_min, dt_max)
106
- dt = tl.minimum(tl.maximum(dt, dt_min), dt_max)
107
- dt = tl.where(
108
- (offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit), dt, 0.0
109
- )
110
- tl.store(
111
- dt_out_ptrs,
112
- dt,
113
- mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size),
114
- )
115
- A = tl.load(A_ptrs, mask=offs_h < nheads, other=0.0).to(tl.float32)
116
- dA = dt * A[:, None]
117
- dA_cs = tl.cumsum(dA, axis=1)
118
- tl.store(
119
- dA_cs_ptrs,
120
- dA_cs,
121
- mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size),
122
- )
123
-
124
-
125
- @triton.autotune(
126
- configs=[
127
- triton.Config(
128
- {"BLOCK_SIZE_H": 1}, pre_hook=init_to_zero(["dA_ptr", "ddt_bias_ptr"])
129
- ),
130
- triton.Config(
131
- {"BLOCK_SIZE_H": 2}, pre_hook=init_to_zero(["dA_ptr", "ddt_bias_ptr"])
132
- ),
133
- triton.Config(
134
- {"BLOCK_SIZE_H": 4}, pre_hook=init_to_zero(["dA_ptr", "ddt_bias_ptr"])
135
- ),
136
- triton.Config(
137
- {"BLOCK_SIZE_H": 8}, pre_hook=init_to_zero(["dA_ptr", "ddt_bias_ptr"])
138
- ),
139
- triton.Config(
140
- {"BLOCK_SIZE_H": 16}, pre_hook=init_to_zero(["dA_ptr", "ddt_bias_ptr"])
141
- ),
142
- triton.Config(
143
- {"BLOCK_SIZE_H": 32}, pre_hook=init_to_zero(["dA_ptr", "ddt_bias_ptr"])
144
- ),
145
- triton.Config(
146
- {"BLOCK_SIZE_H": 64}, pre_hook=init_to_zero(["dA_ptr", "ddt_bias_ptr"])
147
- ),
148
- ],
149
- key=["chunk_size", "nheads"],
150
- )
151
- @triton.jit
152
- def _chunk_cumsum_bwd_kernel(
153
- # Pointers to matrices
154
- ddA_ptr,
155
- ddt_out_ptr,
156
- dt_ptr,
157
- A_ptr,
158
- dt_bias_ptr,
159
- ddt_ptr,
160
- dA_ptr,
161
- ddt_bias_ptr,
162
- # Matrix dimensions
163
- batch,
164
- seqlen,
165
- nheads,
166
- chunk_size,
167
- dt_min,
168
- dt_max,
169
- # Strides
170
- stride_ddA_batch,
171
- stride_ddA_chunk,
172
- stride_ddA_head,
173
- stride_ddA_csize,
174
- stride_ddt_out_batch,
175
- stride_ddt_out_chunk,
176
- stride_ddt_out_head,
177
- stride_ddt_out_csize,
178
- stride_dt_batch,
179
- stride_dt_seqlen,
180
- stride_dt_head,
181
- stride_A_head,
182
- stride_dt_bias_head,
183
- stride_ddt_batch,
184
- stride_ddt_seqlen,
185
- stride_ddt_head,
186
- stride_dA_head,
187
- stride_ddt_bias_head,
188
- # Meta-parameters
189
- DT_SOFTPLUS: tl.constexpr,
190
- HAS_DT_BIAS: tl.constexpr,
191
- BLOCK_SIZE_H: tl.constexpr,
192
- BLOCK_SIZE_CHUNK: tl.constexpr,
193
- ):
194
- pid_b = tl.program_id(axis=0)
195
- pid_c = tl.program_id(axis=1)
196
- pid_h = tl.program_id(axis=2)
197
- ddt_out_ptr += pid_b * stride_ddt_out_batch + pid_c * stride_ddt_out_chunk
198
- ddA_ptr += pid_b * stride_ddA_batch + pid_c * stride_ddA_chunk
199
- dt_ptr += pid_b * stride_dt_batch + pid_c * chunk_size * stride_dt_seqlen
200
- ddt_ptr += pid_b * stride_ddt_batch + pid_c * chunk_size * stride_ddt_seqlen
201
-
202
- offs_h = pid_h * BLOCK_SIZE_H + tl.arange(0, BLOCK_SIZE_H)
203
- offs_c = tl.arange(0, BLOCK_SIZE_CHUNK)
204
- ddt_out_ptrs = ddt_out_ptr + (
205
- offs_h[:, None] * stride_ddt_out_head + offs_c[None, :] * stride_ddt_out_csize
206
- )
207
- ddA_ptrs = ddA_ptr + (
208
- offs_h[:, None] * stride_ddA_head + offs_c[None, :] * stride_ddA_csize
209
- )
210
- dt_ptrs = dt_ptr + (
211
- offs_h[:, None] * stride_dt_head + offs_c[None, :] * stride_dt_seqlen
212
- )
213
- ddt_ptrs = ddt_ptr + (
214
- offs_h[:, None] * stride_ddt_head + offs_c[None, :] * stride_ddt_seqlen
215
- )
216
- A_ptrs = A_ptr + offs_h * stride_A_head
217
- chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
218
-
219
- ddA = tl.load(
220
- ddA_ptrs,
221
- mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit),
222
- other=0.0,
223
- ).to(tl.float32)
224
- ddt_out = tl.load(
225
- ddt_out_ptrs,
226
- mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit),
227
- other=0.0,
228
- ).to(tl.float32)
229
- A = tl.load(A_ptrs, mask=offs_h < nheads, other=0.0).to(tl.float32)
230
- ddt = ddA * A[:, None] + ddt_out
231
- dt = tl.load(
232
- dt_ptrs,
233
- mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit),
234
- other=0.0,
235
- ).to(tl.float32)
236
- if HAS_DT_BIAS:
237
- dt_bias = tl.load(
238
- dt_bias_ptr + offs_h * stride_dt_bias_head, mask=offs_h < nheads, other=0.0
239
- ).to(tl.float32)
240
- dt += dt_bias[:, None]
241
- if DT_SOFTPLUS:
242
- dt_presoftplus = dt
243
-        dt = tl.where(dt <= 20.0, softplus(dt), dt)
244
- clamp_mask = (dt < dt_min) | (dt > dt_max)
245
- # As of Triton 2.2.0, tl.clamp is not available yet
246
- # dt = tl.clamp(dt, dt_min, dt_max)
247
- dt = tl.minimum(tl.maximum(dt, dt_min), dt_max)
248
- dt = tl.where(
249
- (offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit), dt, 0.0
250
- )
251
- ddt = tl.where(
252
- (offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit), ddt, 0.0
253
- )
254
- ddt = tl.where(clamp_mask, 0.0, ddt)
255
- if DT_SOFTPLUS:
256
- ddt = tl.where(dt_presoftplus <= 20.0, ddt * tl.sigmoid(dt_presoftplus), ddt)
257
- tl.store(
258
- ddt_ptrs,
259
- ddt,
260
- mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit),
261
- )
262
- dA = tl.sum(ddA * dt, axis=1)
263
- tl.atomic_add(dA_ptr + offs_h * stride_dA_head, dA, mask=offs_h < nheads)
264
- if HAS_DT_BIAS:
265
- ddt_bias = tl.sum(ddt, axis=1)
266
- tl.atomic_add(
267
- ddt_bias_ptr + offs_h * stride_ddt_bias_head, ddt_bias, mask=offs_h < nheads
268
- )
269
-
270
-
271
- @triton.autotune(
272
- configs=[
273
- triton.Config(
274
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64},
275
- num_stages=3,
276
- num_warps=8,
277
- ),
278
- triton.Config(
279
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32},
280
- num_stages=4,
281
- num_warps=4,
282
- ),
283
- triton.Config(
284
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32},
285
- num_stages=4,
286
- num_warps=4,
287
- ),
288
- triton.Config(
289
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
290
- num_stages=4,
291
- num_warps=4,
292
- ),
293
- triton.Config(
294
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32},
295
- num_stages=4,
296
- num_warps=4,
297
- ),
298
- triton.Config(
299
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32},
300
- num_stages=4,
301
- num_warps=4,
302
- ),
303
- triton.Config(
304
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32},
305
- num_stages=5,
306
- num_warps=2,
307
- ),
308
- triton.Config(
309
- {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
310
- num_stages=5,
311
- num_warps=2,
312
- ),
313
- triton.Config(
314
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
315
- num_stages=4,
316
- num_warps=2,
317
- ),
318
- ],
319
- key=["hdim", "dstate", "chunk_size"],
320
- )
321
- @triton.jit
322
- def _chunk_state_fwd_kernel(
323
- # Pointers to matrices
324
- x_ptr,
325
- b_ptr,
326
- states_ptr,
327
- dt_ptr,
328
- dA_cumsum_ptr,
329
- seq_idx_ptr,
330
- # Matrix dimensions
331
- hdim,
332
- dstate,
333
- chunk_size,
334
- batch,
335
- seqlen,
336
- nheads_ngroups_ratio,
337
- # Strides
338
- stride_x_batch,
339
- stride_x_seqlen,
340
- stride_x_head,
341
- stride_x_hdim,
342
- stride_b_batch,
343
- stride_b_seqlen,
344
- stride_b_head,
345
- stride_b_dstate,
346
- stride_states_batch,
347
- stride_states_chunk,
348
- stride_states_head,
349
- stride_states_hdim,
350
- stride_states_dstate,
351
- stride_dt_batch,
352
- stride_dt_chunk,
353
- stride_dt_head,
354
- stride_dt_csize,
355
- stride_dA_cs_batch,
356
- stride_dA_cs_chunk,
357
- stride_dA_cs_head,
358
- stride_dA_cs_csize,
359
- stride_seq_idx_batch,
360
- stride_seq_idx_seqlen,
361
- # Meta-parameters
362
- HAS_SEQ_IDX: tl.constexpr,
363
- BLOCK_SIZE_M: tl.constexpr,
364
- BLOCK_SIZE_N: tl.constexpr,
365
- BLOCK_SIZE_K: tl.constexpr,
366
- ):
367
- pid_bc = tl.program_id(axis=1)
368
- pid_c = pid_bc // batch
369
- pid_b = pid_bc - pid_c * batch
370
- pid_h = tl.program_id(axis=2)
371
- num_pid_n = tl.cdiv(dstate, BLOCK_SIZE_N)
372
- pid_m = tl.program_id(axis=0) // num_pid_n
373
- pid_n = tl.program_id(axis=0) % num_pid_n
374
- b_ptr += (
375
- pid_b * stride_b_batch
376
- + pid_c * chunk_size * stride_b_seqlen
377
- + (pid_h // nheads_ngroups_ratio) * stride_b_head
378
- )
379
- x_ptr += (
380
- pid_b * stride_x_batch
381
- + pid_c * chunk_size * stride_x_seqlen
382
- + pid_h * stride_x_head
383
- )
384
- dt_ptr += pid_b * stride_dt_batch + pid_c * stride_dt_chunk + pid_h * stride_dt_head
385
- dA_cumsum_ptr += (
386
- pid_b * stride_dA_cs_batch
387
- + pid_c * stride_dA_cs_chunk
388
- + pid_h * stride_dA_cs_head
389
- )
390
- if HAS_SEQ_IDX:
391
- seq_idx_ptr += (
392
- pid_b * stride_seq_idx_batch + pid_c * chunk_size * stride_seq_idx_seqlen
393
- )
394
-
395
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
396
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
397
- offs_k = tl.arange(0, BLOCK_SIZE_K)
398
- x_ptrs = x_ptr + (
399
- offs_m[:, None] * stride_x_hdim + offs_k[None, :] * stride_x_seqlen
400
- )
401
- b_ptrs = b_ptr + (
402
- offs_n[None, :] * stride_b_dstate + offs_k[:, None] * stride_b_seqlen
403
- )
404
- dt_ptrs = dt_ptr + offs_k * stride_dt_csize
405
- dA_cs_last = tl.load(dA_cumsum_ptr + (chunk_size - 1) * stride_dA_cs_csize).to(
406
- tl.float32
407
- )
408
- dA_cumsum_ptrs = dA_cumsum_ptr + offs_k * stride_dA_cs_csize
409
- if HAS_SEQ_IDX:
410
- seq_idx_ptrs = seq_idx_ptr + offs_k * stride_seq_idx_seqlen
411
-
412
- chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
413
- if HAS_SEQ_IDX:
414
- seq_idx_last = tl.load(
415
- seq_idx_ptr + (chunk_size_limit - 1) * stride_seq_idx_seqlen
416
- )
417
-
418
- acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
419
- for k in range(0, chunk_size_limit, BLOCK_SIZE_K):
420
- x = tl.load(
421
- x_ptrs,
422
- mask=(offs_m[:, None] < hdim) & (offs_k[None, :] < chunk_size_limit - k),
423
- other=0.0,
424
- )
425
- b = tl.load(
426
- b_ptrs,
427
- mask=(offs_k[:, None] < chunk_size_limit - k) & (offs_n[None, :] < dstate),
428
- other=0.0,
429
- ).to(tl.float32)
430
- dA_cs_k = tl.load(
431
- dA_cumsum_ptrs, mask=offs_k < chunk_size_limit - k, other=0.0
432
- ).to(tl.float32)
433
- if HAS_SEQ_IDX:
434
- seq_idx_k = tl.load(
435
- seq_idx_ptrs, mask=offs_k < chunk_size_limit - k, other=-1
436
- )
437
- dt_k = tl.load(dt_ptrs, mask=offs_k < chunk_size_limit - k, other=0.0).to(
438
- tl.float32
439
- )
440
- if not HAS_SEQ_IDX:
441
- scale = tl.exp((dA_cs_last - dA_cs_k)) * dt_k
442
- else:
443
- scale = tl.where(
444
- seq_idx_k == seq_idx_last, tl.exp((dA_cs_last - dA_cs_k)) * dt_k, 0.0
445
- )
446
- b *= scale[:, None]
447
- b = b.to(x_ptr.dtype.element_ty)
448
- acc += tl.dot(x, b)
449
- x_ptrs += BLOCK_SIZE_K * stride_x_seqlen
450
- b_ptrs += BLOCK_SIZE_K * stride_b_seqlen
451
- dt_ptrs += BLOCK_SIZE_K * stride_dt_csize
452
- dA_cumsum_ptrs += BLOCK_SIZE_K * stride_dA_cs_csize
453
- if HAS_SEQ_IDX:
454
- seq_idx_ptrs += BLOCK_SIZE_K * stride_seq_idx_seqlen
455
- states = acc.to(states_ptr.dtype.element_ty)
456
-
457
- states_ptr += (
458
- pid_b * stride_states_batch
459
- + pid_c * stride_states_chunk
460
- + pid_h * stride_states_head
461
- )
462
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
463
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
464
- states_ptrs = states_ptr + (
465
- offs_m[:, None] * stride_states_hdim + offs_n[None, :] * stride_states_dstate
466
- )
467
- c_mask = (offs_m[:, None] < hdim) & (offs_n[None, :] < dstate)
468
- tl.store(states_ptrs, states, mask=c_mask)
469
-
470
-
471
- @triton.autotune(
472
- configs=[
473
- triton.Config(
474
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64},
475
- num_stages=3,
476
- num_warps=8,
477
- pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"]),
478
- ),
479
- triton.Config(
480
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32},
481
- num_stages=4,
482
- num_warps=4,
483
- pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"]),
484
- ),
485
- triton.Config(
486
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32},
487
- num_stages=4,
488
- num_warps=4,
489
- pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"]),
490
- ),
491
- triton.Config(
492
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
493
- num_stages=4,
494
- num_warps=4,
495
- pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"]),
496
- ),
497
- triton.Config(
498
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32},
499
- num_stages=4,
500
- num_warps=4,
501
- pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"]),
502
- ),
503
- triton.Config(
504
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32},
505
- num_stages=4,
506
- num_warps=4,
507
- pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"]),
508
- ),
509
- triton.Config(
510
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32},
511
- num_stages=5,
512
- num_warps=4,
513
- pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"]),
514
- ),
515
- triton.Config(
516
- {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
517
- num_stages=5,
518
- num_warps=4,
519
- pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"]),
520
- ),
521
- triton.Config(
522
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
523
- num_stages=4,
524
- num_warps=4,
525
- pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"]),
526
- ),
527
- ],
528
- key=["chunk_size", "hdim", "dstate"],
529
- )
530
- @triton.jit
531
- def _chunk_state_bwd_dx_kernel(
532
- # Pointers to matrices
533
- x_ptr,
534
- b_ptr,
535
- dstates_ptr,
536
- dt_ptr,
537
- dA_cumsum_ptr,
538
- dx_ptr,
539
- ddt_ptr,
540
- ddA_cumsum_ptr,
541
- # Matrix dimensions
542
- chunk_size,
543
- hdim,
544
- dstate,
545
- batch,
546
- seqlen,
547
- nheads_ngroups_ratio,
548
- # Strides
549
- stride_x_batch,
550
- stride_x_seqlen,
551
- stride_x_head,
552
- stride_x_hdim,
553
- stride_b_batch,
554
- stride_b_seqlen,
555
- stride_b_head,
556
- stride_b_dstate,
557
- stride_dstates_batch,
558
- stride_dstates_chunk,
559
- stride_states_head,
560
- stride_states_hdim,
561
- stride_states_dstate,
562
- stride_dt_batch,
563
- stride_dt_chunk,
564
- stride_dt_head,
565
- stride_dt_csize,
566
- stride_dA_cs_batch,
567
- stride_dA_cs_chunk,
568
- stride_dA_cs_head,
569
- stride_dA_cs_csize,
570
- stride_dx_batch,
571
- stride_dx_seqlen,
572
- stride_dx_head,
573
- stride_dx_hdim,
574
- stride_ddt_batch,
575
- stride_ddt_chunk,
576
- stride_ddt_head,
577
- stride_ddt_csize,
578
- stride_ddA_cs_batch,
579
- stride_ddA_cs_chunk,
580
- stride_ddA_cs_head,
581
- stride_ddA_cs_csize,
582
- # Meta-parameters
583
- BLOCK_SIZE_M: tl.constexpr,
584
- BLOCK_SIZE_N: tl.constexpr,
585
- BLOCK_SIZE_K: tl.constexpr,
586
- BLOCK_SIZE_DSTATE: tl.constexpr,
587
- ):
588
- pid_bc = tl.program_id(axis=1)
589
- pid_c = pid_bc // batch
590
- pid_b = pid_bc - pid_c * batch
591
- pid_h = tl.program_id(axis=2)
592
- num_pid_n = tl.cdiv(hdim, BLOCK_SIZE_N)
593
- pid_m = tl.program_id(axis=0) // num_pid_n
594
- pid_n = tl.program_id(axis=0) % num_pid_n
595
- x_ptr += (
596
- pid_b * stride_x_batch
597
- + pid_c * chunk_size * stride_x_seqlen
598
- + pid_h * stride_x_head
599
- )
600
- b_ptr += (
601
- pid_b * stride_b_batch
602
- + pid_c * chunk_size * stride_b_seqlen
603
- + (pid_h // nheads_ngroups_ratio) * stride_b_head
604
- )
605
- dstates_ptr += (
606
- pid_b * stride_dstates_batch
607
- + pid_c * stride_dstates_chunk
608
- + pid_h * stride_states_head
609
- )
610
- dt_ptr += pid_b * stride_dt_batch + pid_c * stride_dt_chunk + pid_h * stride_dt_head
611
- ddt_ptr += (
612
- pid_b * stride_ddt_batch + pid_c * stride_ddt_chunk + pid_h * stride_ddt_head
613
- )
614
- ddA_cumsum_ptr += (
615
- pid_b * stride_ddA_cs_batch
616
- + pid_c * stride_ddA_cs_chunk
617
- + pid_h * stride_ddA_cs_head
618
- )
619
- dA_cumsum_ptr += (
620
- pid_b * stride_dA_cs_batch
621
- + pid_c * stride_dA_cs_chunk
622
- + pid_h * stride_dA_cs_head
623
- )
624
-
625
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
626
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
627
-
628
- chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
629
- # Faster to just do 1 iteration with larger BLOCK_SIZE_K, up to block size 128
630
- offs_k = tl.arange(
631
- 0, BLOCK_SIZE_DSTATE if BLOCK_SIZE_DSTATE <= 128 else BLOCK_SIZE_K
632
- )
633
- b_ptrs = b_ptr + (
634
- offs_m[:, None] * stride_b_seqlen + offs_k[None, :] * stride_b_dstate
635
- )
636
- dstates_ptrs = dstates_ptr + (
637
- offs_n[None, :] * stride_states_hdim + offs_k[:, None] * stride_states_dstate
638
- )
639
- if BLOCK_SIZE_DSTATE <= 128:
640
- b = tl.load(
641
- b_ptrs,
642
- mask=(offs_m[:, None] < chunk_size_limit) & (offs_k[None, :] < dstate),
643
- other=0.0,
644
- )
645
- dstates = tl.load(
646
- dstates_ptrs,
647
- mask=(offs_k[:, None] < dstate) & (offs_n[None, :] < hdim),
648
- other=0.0,
649
- )
650
- dstates = dstates.to(b_ptr.dtype.element_ty)
651
- acc = tl.dot(b, dstates)
652
- else:
653
- acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
654
- for k in range(0, dstate, BLOCK_SIZE_K):
655
- b = tl.load(
656
- b_ptrs,
657
- mask=(offs_m[:, None] < chunk_size_limit)
658
- & (offs_k[None, :] < dstate - k),
659
- other=0.0,
660
- )
661
- dstates = tl.load(
662
- dstates_ptrs,
663
- mask=(offs_k[:, None] < dstate - k) & (offs_n[None, :] < hdim),
664
- other=0.0,
665
- )
666
- dstates = dstates.to(b_ptr.dtype.element_ty)
667
- acc += tl.dot(b, dstates)
668
- b_ptrs += BLOCK_SIZE_K * stride_b_dstate
669
- dstates_ptrs += BLOCK_SIZE_K * stride_states_dstate
670
-
671
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
672
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
673
-
674
- dA_cs_last = tl.load(dA_cumsum_ptr + (chunk_size - 1) * stride_dA_cs_csize).to(
675
- tl.float32
676
- )
677
- dt_ptrs = dt_ptr + offs_m * stride_dt_csize
678
- dA_cumsum_ptrs = dA_cumsum_ptr + offs_m * stride_dA_cs_csize
679
- dA_cs_m = tl.load(dA_cumsum_ptrs, mask=offs_m < chunk_size, other=0.0).to(
680
- tl.float32
681
- )
682
- dt_m = tl.load(dt_ptrs, mask=offs_m < chunk_size, other=0.0).to(tl.float32)
683
- acc *= tl.exp(dA_cs_last - dA_cs_m)[:, None]
684
-
685
- x_ptrs = x_ptr + (
686
- offs_m[:, None] * stride_x_seqlen + offs_n[None, :] * stride_x_hdim
687
- )
688
- x = tl.load(
689
- x_ptrs,
690
- mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim),
691
- other=0.0,
692
- ).to(tl.float32)
693
- ddt = tl.sum(acc * x, axis=1)
694
- ddt_ptrs = ddt_ptr + offs_m * stride_ddt_csize
695
- tl.atomic_add(ddt_ptrs, ddt, mask=offs_m < chunk_size)
696
- ddA_cs = -(ddt * dt_m)
697
- ddA_cs_last = -tl.sum(ddA_cs)
698
- ddA_cumsum_ptrs = ddA_cumsum_ptr + offs_m * stride_ddA_cs_csize
699
- tl.atomic_add(ddA_cumsum_ptrs, ddA_cs, mask=offs_m < chunk_size)
700
- tl.atomic_add(ddA_cumsum_ptr + (chunk_size - 1) * stride_ddA_cs_csize, ddA_cs_last)
701
-
702
- dx = (acc * dt_m[:, None]).to(dx_ptr.dtype.element_ty)
703
- dx_ptr += (
704
- pid_b * stride_dx_batch
705
- + pid_c * chunk_size * stride_dx_seqlen
706
- + pid_h * stride_dx_head
707
- )
708
- dx_ptrs = dx_ptr + (
709
- offs_m[:, None] * stride_dx_seqlen + offs_n[None, :] * stride_dx_hdim
710
- )
711
- tl.store(
712
- dx_ptrs,
713
- dx,
714
- mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim),
715
- )
716
-
717
-
718
- @triton.autotune(
719
- configs=[
720
- triton.Config(
721
- {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128},
722
- num_stages=3,
723
- num_warps=4,
724
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
725
- ),
726
- triton.Config(
727
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32},
728
- num_stages=3,
729
- num_warps=4,
730
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
731
- ),
732
- triton.Config(
733
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128},
734
- num_stages=3,
735
- num_warps=4,
736
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
737
- ),
738
- triton.Config(
739
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64},
740
- num_stages=3,
741
- num_warps=4,
742
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
743
- ),
744
- triton.Config(
745
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64},
746
- num_stages=3,
747
- num_warps=4,
748
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
749
- ),
750
- triton.Config(
751
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32},
752
- num_stages=3,
753
- num_warps=4,
754
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
755
- ),
756
- triton.Config(
757
- {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64},
758
- num_stages=3,
759
- num_warps=4,
760
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
761
- ),
762
- triton.Config(
763
- {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32},
764
- num_stages=3,
765
- num_warps=4,
766
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
767
- ),
768
- ],
769
- key=["chunk_size", "dstate", "hdim"],
770
- )
771
- @triton.jit
772
- def _chunk_state_bwd_db_kernel(
773
- # Pointers to matrices
774
- x_ptr,
775
- dstates_ptr,
776
- b_ptr,
777
- dt_ptr,
778
- dA_cumsum_ptr,
779
- seq_idx_ptr,
780
- db_ptr,
781
- ddA_cumsum_ptr,
782
- # Matrix dimensions
783
- chunk_size,
784
- dstate,
785
- hdim,
786
- batch,
787
- seqlen,
788
- nheads,
789
- nheads_per_program,
790
- ngroups,
791
- # Strides
792
- stride_x_batch,
793
- stride_x_seqlen,
794
- stride_x_head,
795
- stride_x_hdim,
796
- stride_dstates_batch,
797
- stride_dstates_chunk,
798
- stride_states_head,
799
- stride_states_hdim,
800
- stride_states_dstate,
801
- stride_b_batch,
802
- stride_b_seqlen,
803
- stride_b_head,
804
- stride_b_dstate,
805
- stride_dt_batch,
806
- stride_dt_chunk,
807
- stride_dt_head,
808
- stride_dt_csize,
809
- stride_dA_cs_batch,
810
- stride_dA_cs_chunk,
811
- stride_dA_cs_head,
812
- stride_dA_cs_csize,
813
- stride_seq_idx_batch,
814
- stride_seq_idx_seqlen,
815
- stride_db_batch,
816
- stride_db_seqlen,
817
- stride_db_split,
818
- stride_db_group,
819
- stride_db_dstate,
820
- stride_ddA_cs_batch,
821
- stride_ddA_cs_chunk,
822
- stride_ddA_cs_head,
823
- stride_ddA_cs_csize,
824
- # Meta-parameters
825
- HAS_DDA_CS: tl.constexpr,
826
- HAS_SEQ_IDX: tl.constexpr,
827
- BLOCK_SIZE_M: tl.constexpr,
828
- BLOCK_SIZE_N: tl.constexpr,
829
- BLOCK_SIZE_K: tl.constexpr,
830
- ):
831
- pid_bc = tl.program_id(axis=1)
832
- pid_c = pid_bc // batch
833
- pid_b = pid_bc - pid_c * batch
834
- pid_sg = tl.program_id(axis=2)
835
- pid_s = pid_sg // ngroups
836
- pid_g = pid_sg - pid_s * ngroups
837
- num_pid_n = tl.cdiv(dstate, BLOCK_SIZE_N)
838
- pid_m = tl.program_id(axis=0) // num_pid_n
839
- pid_n = tl.program_id(axis=0) % num_pid_n
840
- x_ptr += (
841
- pid_b * stride_x_batch
842
- + pid_c * chunk_size * stride_x_seqlen
843
- + (pid_g * (nheads // ngroups) + pid_s * nheads_per_program) * stride_x_head
844
- )
845
- db_ptr += (
846
- pid_b * stride_db_batch
847
- + pid_c * chunk_size * stride_db_seqlen
848
- + pid_g * stride_db_group
849
- + pid_s * stride_db_split
850
- )
851
- dstates_ptr += (
852
- pid_b * stride_dstates_batch
853
- + pid_c * stride_dstates_chunk
854
- + (pid_g * (nheads // ngroups) + pid_s * nheads_per_program)
855
- * stride_states_head
856
- )
857
- dt_ptr += (
858
- pid_b * stride_dt_batch
859
- + pid_c * stride_dt_chunk
860
- + (pid_g * (nheads // ngroups) + pid_s * nheads_per_program) * stride_dt_head
861
- )
862
- dA_cumsum_ptr += (
863
- pid_b * stride_dA_cs_batch
864
- + pid_c * stride_dA_cs_chunk
865
- + (pid_g * (nheads // ngroups) + pid_s * nheads_per_program) * stride_dA_cs_head
866
- )
867
- if HAS_DDA_CS:
868
- b_ptr += (
869
- pid_b * stride_b_batch
870
- + pid_c * chunk_size * stride_b_seqlen
871
- + pid_g * stride_b_head
872
- )
873
- ddA_cumsum_ptr += (
874
- pid_b * stride_ddA_cs_batch
875
- + pid_c * stride_ddA_cs_chunk
876
- + (pid_g * (nheads // ngroups) + pid_s * nheads_per_program)
877
- * stride_ddA_cs_head
878
- )
879
- if HAS_SEQ_IDX:
880
- seq_idx_ptr += (
881
- pid_b * stride_seq_idx_batch + pid_c * chunk_size * stride_seq_idx_seqlen
882
- )
883
-
884
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
885
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
886
- offs_k = tl.arange(0, BLOCK_SIZE_K)
887
- x_ptrs = x_ptr + (
888
- offs_m[:, None] * stride_x_seqlen + offs_k[None, :] * stride_x_hdim
889
- )
890
- dstates_ptrs = dstates_ptr + (
891
- offs_n[None, :] * stride_states_dstate + offs_k[:, None] * stride_states_hdim
892
- )
893
- dt_ptrs = dt_ptr + offs_m * stride_dt_csize
894
- dA_cumsum_ptrs = dA_cumsum_ptr + offs_m * stride_dA_cs_csize
895
- if HAS_DDA_CS:
896
- b_ptrs = b_ptr + (
897
- offs_m[:, None] * stride_b_seqlen + offs_n[None, :] * stride_b_dstate
898
- )
899
- ddA_cumsum_ptrs = ddA_cumsum_ptr + offs_m * stride_ddA_cs_csize
900
-
901
- chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
902
- acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
903
- if HAS_DDA_CS:
904
- b = tl.load(
905
- b_ptrs,
906
- mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < dstate),
907
- other=0.0,
908
- ).to(tl.float32)
909
- if HAS_SEQ_IDX:
910
- seq_idx_m = tl.load(
911
- seq_idx_ptr + offs_m * stride_seq_idx_seqlen,
912
- mask=offs_m < chunk_size_limit,
913
- other=-1,
914
- )
915
- seq_idx_last = tl.load(
916
- seq_idx_ptr + (chunk_size_limit - 1) * stride_seq_idx_seqlen
917
- )
918
- nheads_iter = min(
919
- nheads_per_program, nheads // ngroups - pid_s * nheads_per_program
920
- )
921
- for h in range(nheads_iter):
922
- x = tl.load(
923
- x_ptrs,
924
- mask=(offs_m[:, None] < chunk_size_limit) & (offs_k[None, :] < hdim),
925
- other=0.0,
926
- )
927
- dstates = tl.load(
928
- dstates_ptrs,
929
- mask=(offs_k[:, None] < hdim) & (offs_n[None, :] < dstate),
930
- other=0.0,
931
- )
932
- dstates = dstates.to(x_ptrs.dtype.element_ty)
933
- db = tl.dot(x, dstates)
934
- dA_cs_last = tl.load(dA_cumsum_ptr + (chunk_size - 1) * stride_dA_cs_csize).to(
935
- tl.float32
936
- )
937
- dA_cs_m = tl.load(dA_cumsum_ptrs, mask=offs_m < chunk_size, other=0.0).to(
938
- tl.float32
939
- )
940
- dt_m = tl.load(dt_ptrs, mask=offs_m < chunk_size, other=0.0).to(tl.float32)
941
- if not HAS_SEQ_IDX:
942
- scale = tl.exp(dA_cs_last - dA_cs_m)
943
- else:
944
- scale = tl.where(
945
- seq_idx_m == seq_idx_last, tl.exp(dA_cs_last - dA_cs_m), 0.0
946
- )
947
- db *= (scale * dt_m)[:, None]
948
- if HAS_DDA_CS:
949
- # This is the gradient wrt (dA_cs_last - dA_cs_m), i.e. the exclusive reverse cumsum
950
- ddA_cs = tl.sum(db * b, axis=1)
951
- tl.atomic_add(
952
- ddA_cumsum_ptrs + stride_ddA_cs_csize,
953
- ddA_cs,
954
- mask=offs_m < chunk_size - 1,
955
- )
956
- acc += db
957
- x_ptrs += stride_x_head
958
- dstates_ptrs += stride_states_head
959
- dt_ptrs += stride_dt_head
960
- dA_cumsum_ptr += stride_dA_cs_head
961
- dA_cumsum_ptrs += stride_dA_cs_head
962
- if HAS_DDA_CS:
963
- ddA_cumsum_ptrs += stride_ddA_cs_head
964
-
965
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
966
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
967
- # if HAS_SEQ_IDX:
968
- # seq_idx_last = tl.load(seq_idx_ptr + (chunk_size_limit - 1) * stride_seq_idx_seqlen)
969
- # seq_idx_m = tl.load(seq_idx_ptr + offs_m * stride_seq_idx_seqlen, mask=offs_m < chunk_size_limit, other=-1)
970
- # acc = tl.where(seq_idx_m[:, None] == seq_idx_last, acc, 0.0)
971
- db_ptrs = db_ptr + (
972
- offs_m[:, None] * stride_db_seqlen + offs_n[None, :] * stride_db_dstate
973
- )
974
- tl.store(
975
- db_ptrs,
976
- acc,
977
- mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < dstate),
978
- )
979
-
980
-
981
- @triton.autotune(
982
- configs=[
983
- # triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64}, num_stages=3, num_warps=8, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
984
- # triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
985
- # triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
986
- # triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
987
- # triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
988
- # triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
989
- # triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32}, num_stages=5, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
990
- # triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=5, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
991
- # triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
992
- triton.Config(
993
- {"BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 32},
994
- num_stages=3,
995
- num_warps=4,
996
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
997
- ),
998
- triton.Config(
999
- {"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32},
1000
- num_stages=3,
1001
- num_warps=4,
1002
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
1003
- ),
1004
- triton.Config(
1005
- {"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
1006
- num_stages=3,
1007
- num_warps=4,
1008
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
1009
- ),
1010
- triton.Config(
1011
- {"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32},
1012
- num_stages=3,
1013
- num_warps=4,
1014
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
1015
- ),
1016
- triton.Config(
1017
- {"BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 32},
1018
- num_stages=4,
1019
- num_warps=8,
1020
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
1021
- ),
1022
- triton.Config(
1023
- {"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32},
1024
- num_stages=4,
1025
- num_warps=8,
1026
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
1027
- ),
1028
- triton.Config(
1029
- {"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
1030
- num_stages=4,
1031
- num_warps=8,
1032
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
1033
- ),
1034
- triton.Config(
1035
- {"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32},
1036
- num_stages=4,
1037
- num_warps=8,
1038
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
1039
- ),
1040
- ],
1041
- key=["chunk_size", "hdim", "dstate"],
1042
- )
1043
- @triton.jit
1044
- def _chunk_state_bwd_ddAcs_stable_kernel(
1045
- # Pointers to matrices
1046
- x_ptr,
1047
- b_ptr,
1048
- dstates_ptr,
1049
- dt_ptr,
1050
- dA_cumsum_ptr,
1051
- seq_idx_ptr,
1052
- ddA_cumsum_ptr,
1053
- # Matrix dimensions
1054
- chunk_size,
1055
- hdim,
1056
- dstate,
1057
- batch,
1058
- seqlen,
1059
- nheads_ngroups_ratio,
1060
- # Strides
1061
- stride_x_batch,
1062
- stride_x_seqlen,
1063
- stride_x_head,
1064
- stride_x_hdim,
1065
- stride_b_batch,
1066
- stride_b_seqlen,
1067
- stride_b_head,
1068
- stride_b_dstate,
1069
- stride_dstates_batch,
1070
- stride_dstates_chunk,
1071
- stride_states_head,
1072
- stride_states_hdim,
1073
- stride_states_dstate,
1074
- stride_dt_batch,
1075
- stride_dt_chunk,
1076
- stride_dt_head,
1077
- stride_dt_csize,
1078
- stride_dA_cs_batch,
1079
- stride_dA_cs_chunk,
1080
- stride_dA_cs_head,
1081
- stride_dA_cs_csize,
1082
- stride_seq_idx_batch,
1083
- stride_seq_idx_seqlen,
1084
- stride_ddA_cs_batch,
1085
- stride_ddA_cs_chunk,
1086
- stride_ddA_cs_head,
1087
- stride_ddA_cs_csize,
1088
- # Meta-parameters
1089
- HAS_SEQ_IDX: tl.constexpr,
1090
- BLOCK_SIZE_M: tl.constexpr,
1091
- BLOCK_SIZE_N: tl.constexpr,
1092
- BLOCK_SIZE_K: tl.constexpr,
1093
- BLOCK_SIZE_DSTATE: tl.constexpr,
1094
- ):
1095
- pid_bc = tl.program_id(axis=1)
1096
- pid_c = pid_bc // batch
1097
- pid_b = pid_bc - pid_c * batch
1098
- pid_h = tl.program_id(axis=2)
1099
- num_pid_n = tl.cdiv(hdim, BLOCK_SIZE_N)
1100
- pid_m = tl.program_id(axis=0) // num_pid_n
1101
- pid_n = tl.program_id(axis=0) % num_pid_n
1102
- x_ptr += (
1103
- pid_b * stride_x_batch
1104
- + pid_c * chunk_size * stride_x_seqlen
1105
- + pid_h * stride_x_head
1106
- )
1107
- b_ptr += (
1108
- pid_b * stride_b_batch
1109
- + pid_c * chunk_size * stride_b_seqlen
1110
- + (pid_h // nheads_ngroups_ratio) * stride_b_head
1111
- )
1112
- dstates_ptr += (
1113
- pid_b * stride_dstates_batch
1114
- + pid_c * stride_dstates_chunk
1115
- + pid_h * stride_states_head
1116
- )
1117
- dt_ptr += pid_b * stride_dt_batch + pid_c * stride_dt_chunk + pid_h * stride_dt_head
1118
- ddA_cumsum_ptr += (
1119
- pid_b * stride_ddA_cs_batch
1120
- + pid_c * stride_ddA_cs_chunk
1121
- + pid_h * stride_ddA_cs_head
1122
- )
1123
- dA_cumsum_ptr += (
1124
- pid_b * stride_dA_cs_batch
1125
- + pid_c * stride_dA_cs_chunk
1126
- + pid_h * stride_dA_cs_head
1127
- )
1128
- if HAS_SEQ_IDX:
1129
- seq_idx_ptr += (
1130
- pid_b * stride_seq_idx_batch + pid_c * chunk_size * stride_seq_idx_seqlen
1131
- )
1132
-
1133
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
1134
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
1135
-
1136
- chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
1137
- # Faster to just do 1 iteration with larger BLOCK_SIZE_K, up to block size 128
1138
- offs_k = tl.arange(
1139
- 0, BLOCK_SIZE_DSTATE if BLOCK_SIZE_DSTATE <= 128 else BLOCK_SIZE_K
1140
- )
1141
- b_ptrs = b_ptr + (
1142
- offs_m[:, None] * stride_b_seqlen + offs_k[None, :] * stride_b_dstate
1143
- )
1144
- dstates_ptrs = dstates_ptr + (
1145
- offs_n[None, :] * stride_states_hdim + offs_k[:, None] * stride_states_dstate
1146
- )
1147
- if BLOCK_SIZE_DSTATE <= 128:
1148
- b = tl.load(
1149
- b_ptrs,
1150
- mask=(offs_m[:, None] < chunk_size_limit) & (offs_k[None, :] < dstate),
1151
- other=0.0,
1152
- )
1153
- dstates = tl.load(
1154
- dstates_ptrs,
1155
- mask=(offs_k[:, None] < dstate) & (offs_n[None, :] < hdim),
1156
- other=0.0,
1157
- )
1158
- dstates = dstates.to(b_ptr.dtype.element_ty)
1159
- acc = tl.dot(b, dstates)
1160
- else:
1161
- acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
1162
- for k in range(0, dstate, BLOCK_SIZE_K):
1163
- b = tl.load(
1164
- b_ptrs,
1165
- mask=(offs_m[:, None] < chunk_size_limit)
1166
- & (offs_k[None, :] < dstate - k),
1167
- other=0.0,
1168
- )
1169
- dstates = tl.load(
1170
- dstates_ptrs,
1171
- mask=(offs_k[:, None] < dstate - k) & (offs_n[None, :] < hdim),
1172
- other=0.0,
1173
- )
1174
- dstates = dstates.to(b_ptr.dtype.element_ty)
1175
- acc += tl.dot(b, dstates)
1176
- b_ptrs += BLOCK_SIZE_K * stride_b_dstate
1177
- dstates_ptrs += BLOCK_SIZE_K * stride_states_dstate
1178
-
1179
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
1180
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
1181
-
1182
- dA_cs_m = tl.load(
1183
- dA_cumsum_ptr + offs_m * stride_dA_cs_csize, mask=offs_m < chunk_size, other=0.0
1184
- ).to(tl.float32)
1185
- dA_cs_last = tl.load(dA_cumsum_ptr + (chunk_size - 1) * stride_dA_cs_csize).to(
1186
- tl.float32
1187
- )
1188
- if not HAS_SEQ_IDX:
1189
- scale = tl.exp(dA_cs_last - dA_cs_m)
1190
- else:
1191
- seq_idx_m = tl.load(
1192
- seq_idx_ptr + offs_m * stride_seq_idx_seqlen,
1193
- mask=offs_m < chunk_size_limit,
1194
- other=-1,
1195
- )
1196
- seq_idx_last = tl.load(
1197
- seq_idx_ptr + (chunk_size_limit - 1) * stride_seq_idx_seqlen
1198
- )
1199
- scale = tl.where(seq_idx_m == seq_idx_last, tl.exp(dA_cs_last - dA_cs_m), 0.0)
1200
- acc *= scale[:, None]
1201
-
1202
- x_ptrs = x_ptr + (
1203
- offs_m[:, None] * stride_x_seqlen + offs_n[None, :] * stride_x_hdim
1204
- )
1205
- x = tl.load(
1206
- x_ptrs,
1207
- mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim),
1208
- other=0.0,
1209
- ).to(tl.float32)
1210
- dt_ptrs = dt_ptr + offs_m * stride_dt_csize
1211
- dt_m = tl.load(dt_ptrs, mask=offs_m < chunk_size, other=0.0).to(tl.float32)
1212
- ddt = tl.sum(acc * x, axis=1)
1213
- # ddA_cs = -(ddt * dt_m)
1214
- # Triton 2.2.0 errors if we have the cumsum here, so we just write it out
1215
- # then call torch.cumsum outside this kernel.
1216
- # ddA_cs = tl.cumsum(ddt * dt_m)
1217
- ddA_cs = ddt * dt_m
1218
- ddA_cumsum_ptrs = ddA_cumsum_ptr + offs_m * stride_ddA_cs_csize
1219
- # tl.atomic_add(ddA_cumsum_ptrs, ddA_cs, mask=offs_m < chunk_size)
1220
- tl.atomic_add(
1221
- ddA_cumsum_ptrs + stride_ddA_cs_csize, ddA_cs, mask=offs_m < chunk_size - 1
1222
- )
1223
-
1224
-
1225
- @triton.autotune(
1226
- configs=[
1227
- triton.Config(
1228
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64},
1229
- num_stages=3,
1230
- num_warps=8,
1231
- ),
1232
- triton.Config(
1233
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32},
1234
- num_stages=4,
1235
- num_warps=4,
1236
- ),
1237
- triton.Config(
1238
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32},
1239
- num_stages=4,
1240
- num_warps=4,
1241
- ),
1242
- triton.Config(
1243
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
1244
- num_stages=4,
1245
- num_warps=4,
1246
- ),
1247
- triton.Config(
1248
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32},
1249
- num_stages=4,
1250
- num_warps=4,
1251
- ),
1252
- triton.Config(
1253
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32},
1254
- num_stages=4,
1255
- num_warps=4,
1256
- ),
1257
- triton.Config(
1258
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32},
1259
- num_stages=5,
1260
- num_warps=2,
1261
- ),
1262
- triton.Config(
1263
- {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
1264
- num_stages=5,
1265
- num_warps=2,
1266
- ),
1267
- triton.Config(
1268
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
1269
- num_stages=4,
1270
- num_warps=2,
1271
- ),
1272
- ],
1273
- key=["hdim", "dstate", "chunk_size"],
1274
- )
1275
- @triton.jit
1276
- def _chunk_state_varlen_kernel(
1277
- # Pointers to matrices
1278
- x_ptr,
1279
- b_ptr,
1280
- dt_ptr,
1281
- dA_cumsum_ptr,
1282
- chunk_states_ptr,
1283
- cu_seqlens_ptr,
1284
- states_ptr,
1285
- # Matrix dimensions
1286
- hdim,
1287
- dstate,
1288
- chunk_size,
1289
- seqlen,
1290
- nheads_ngroups_ratio,
1291
- # Strides
1292
- stride_x_seqlen,
1293
- stride_x_head,
1294
- stride_x_hdim,
1295
- stride_b_seqlen,
1296
- stride_b_head,
1297
- stride_b_dstate,
1298
- stride_dt_chunk,
1299
- stride_dt_head,
1300
- stride_dt_csize,
1301
- stride_dA_cs_chunk,
1302
- stride_dA_cs_head,
1303
- stride_dA_cs_csize,
1304
- stride_chunk_states_chunk,
1305
- stride_chunk_states_head,
1306
- stride_chunk_states_hdim,
1307
- stride_chunk_states_dstate,
1308
- stride_states_batch,
1309
- stride_states_head,
1310
- stride_states_hdim,
1311
- stride_states_dstate,
1312
- # Meta-parameters
1313
- BLOCK_SIZE_M: tl.constexpr,
1314
- BLOCK_SIZE_N: tl.constexpr,
1315
- BLOCK_SIZE_K: tl.constexpr,
1316
- ):
1317
- pid_b = tl.program_id(axis=1)
1318
- pid_h = tl.program_id(axis=2)
1319
- num_pid_n = tl.cdiv(dstate, BLOCK_SIZE_N)
1320
- pid_m = tl.program_id(axis=0) // num_pid_n
1321
- pid_n = tl.program_id(axis=0) % num_pid_n
1322
- end_idx = tl.load(cu_seqlens_ptr + pid_b + 1)
1323
- pid_c = (end_idx - 1) // chunk_size
1324
- b_ptr += (
1325
- pid_c * chunk_size * stride_b_seqlen
1326
- + (pid_h // nheads_ngroups_ratio) * stride_b_head
1327
- )
1328
- x_ptr += pid_c * chunk_size * stride_x_seqlen + pid_h * stride_x_head
1329
- dt_ptr += pid_c * stride_dt_chunk + pid_h * stride_dt_head
1330
- dA_cumsum_ptr += pid_c * stride_dA_cs_chunk + pid_h * stride_dA_cs_head
1331
- chunk_states_ptr += (
1332
- pid_c * stride_chunk_states_chunk + pid_h * stride_chunk_states_head
1333
- )
1334
-
1335
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
1336
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
1337
- offs_k = tl.arange(0, BLOCK_SIZE_K)
1338
- x_ptrs = x_ptr + (
1339
- offs_m[:, None] * stride_x_hdim + offs_k[None, :] * stride_x_seqlen
1340
- )
1341
- b_ptrs = b_ptr + (
1342
- offs_n[None, :] * stride_b_dstate + offs_k[:, None] * stride_b_seqlen
1343
- )
1344
- dt_ptrs = dt_ptr + offs_k * stride_dt_csize
1345
- dA_cs_last = tl.load(
1346
- dA_cumsum_ptr + (end_idx - pid_c * chunk_size - 1) * stride_dA_cs_csize
1347
- ).to(tl.float32)
1348
- dA_cumsum_ptrs = dA_cumsum_ptr + offs_k * stride_dA_cs_csize
1349
-
1350
- chunk_size_limit = end_idx - pid_c * chunk_size
1351
- start_idx = tl.load(cu_seqlens_ptr + pid_b)
1352
- start_idx_cur = tl.maximum(start_idx - pid_c * chunk_size, 0)
1353
-
1354
- acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
1355
- for k in range(0, chunk_size_limit, BLOCK_SIZE_K):
1356
- x = tl.load(
1357
- x_ptrs,
1358
- mask=(offs_m[:, None] < hdim)
1359
- & (offs_k[None, :] < chunk_size_limit - k)
1360
- & (offs_k[None, :] >= start_idx_cur - k),
1361
- other=0.0,
1362
- )
1363
- b = tl.load(
1364
- b_ptrs,
1365
- mask=(offs_k[:, None] < chunk_size_limit - k)
1366
- & (offs_n[None, :] < dstate)
1367
- & (offs_k[:, None] >= start_idx_cur - k),
1368
- other=0.0,
1369
- ).to(tl.float32)
1370
- dA_cs_k = tl.load(
1371
- dA_cumsum_ptrs, mask=offs_k < chunk_size_limit - k, other=0.0
1372
- ).to(tl.float32)
1373
- dt_k = tl.load(dt_ptrs, mask=offs_k < chunk_size_limit - k, other=0.0).to(
1374
- tl.float32
1375
- )
1376
- scale = tl.where(
1377
- (offs_k >= start_idx_cur - k) & (offs_k < chunk_size_limit - k),
1378
- tl.exp((dA_cs_last - dA_cs_k)) * dt_k,
1379
- 0.0,
1380
- )
1381
- b *= scale[:, None]
1382
- b = b.to(x_ptr.dtype.element_ty)
1383
- acc += tl.dot(x, b)
1384
- x_ptrs += BLOCK_SIZE_K * stride_x_seqlen
1385
- b_ptrs += BLOCK_SIZE_K * stride_b_seqlen
1386
- dt_ptrs += BLOCK_SIZE_K * stride_dt_csize
1387
- dA_cumsum_ptrs += BLOCK_SIZE_K * stride_dA_cs_csize
1388
-
1389
- # If the sequence starts after the last chunk idx, we don't need to add the contribution from the last chunk
1390
- if start_idx < pid_c * chunk_size:
1391
- chunk_states_ptrs = chunk_states_ptr + (
1392
- offs_m[:, None] * stride_chunk_states_hdim
1393
- + offs_n[None, :] * stride_chunk_states_dstate
1394
- )
1395
- chunk_states = tl.load(
1396
- chunk_states_ptrs,
1397
- mask=(offs_m[:, None] < hdim) & (offs_n[None, :] < dstate),
1398
- other=0.0,
1399
- ).to(tl.float32)
1400
- # scale = tl.where(start_idx < pid_c * chunk_size, tl.exp(dA_cs_last), 0.0)
1401
- scale = tl.exp(dA_cs_last)
1402
- acc += chunk_states * scale
1403
-
1404
- states = acc.to(states_ptr.dtype.element_ty)
1405
-
1406
- states_ptr += pid_b * stride_states_batch + pid_h * stride_states_head
1407
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
1408
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
1409
- states_ptrs = states_ptr + (
1410
- offs_m[:, None] * stride_states_hdim + offs_n[None, :] * stride_states_dstate
1411
- )
1412
- c_mask = (offs_m[:, None] < hdim) & (offs_n[None, :] < dstate)
1413
- tl.store(states_ptrs, states, mask=c_mask)
1414
-
1415
-
1416
- def _chunk_cumsum_fwd(
1417
- dt, A, chunk_size, dt_bias=None, dt_softplus=False, dt_limit=(0.0, float("inf"))
1418
- ):
1419
- batch, seqlen, nheads = dt.shape
1420
- assert A.shape == (nheads,)
1421
- if dt_bias is not None:
1422
- assert dt_bias.shape == (nheads,)
1423
- nchunks = math.ceil(seqlen / chunk_size)
1424
- dt_out = torch.empty(
1425
- batch, nheads, nchunks, chunk_size, device=dt.device, dtype=torch.float32
1426
- )
1427
- dA_cumsum = torch.empty(
1428
- batch, nheads, nchunks, chunk_size, device=dt.device, dtype=torch.float32
1429
- )
1430
- grid_chunk_cs = lambda META: (
1431
- batch,
1432
- nchunks,
1433
- triton.cdiv(nheads, META["BLOCK_SIZE_H"]),
1434
- )
1435
- with torch.cuda.device(dt.device.index):
1436
- _chunk_cumsum_fwd_kernel[grid_chunk_cs](
1437
- dt,
1438
- A,
1439
- dt_bias,
1440
- dt_out,
1441
- dA_cumsum,
1442
- batch,
1443
- seqlen,
1444
- nheads,
1445
- chunk_size,
1446
- dt_limit[0],
1447
- dt_limit[1],
1448
- dt.stride(0),
1449
- dt.stride(1),
1450
- dt.stride(2),
1451
- A.stride(0),
1452
- dt_bias.stride(0) if dt_bias is not None else 0,
1453
- dt_out.stride(0),
1454
- dt_out.stride(2),
1455
- dt_out.stride(1),
1456
- dt_out.stride(3),
1457
- dA_cumsum.stride(0),
1458
- dA_cumsum.stride(2),
1459
- dA_cumsum.stride(1),
1460
- dA_cumsum.stride(3),
1461
- dt_softplus,
1462
- HAS_DT_BIAS=dt_bias is not None,
1463
- BLOCK_SIZE_CHUNK=triton.next_power_of_2(chunk_size),
1464
- )
1465
- return dA_cumsum, dt_out
1466
-
1467
-
1468
- def _chunk_cumsum_bwd(
1469
- ddA,
1470
- ddt_out,
1471
- dt,
1472
- A,
1473
- dt_bias=None,
1474
- dt_softplus=False,
1475
- dt_limit=(0.0, float("inf")),
1476
- ddt=None,
1477
- ):
1478
- batch, seqlen, nheads = dt.shape
1479
- _, _, nchunks, chunk_size = ddA.shape
1480
- assert ddA.shape == (batch, nheads, nchunks, chunk_size)
1481
- assert ddt_out.shape == (batch, nheads, nchunks, chunk_size)
1482
- assert A.shape == (nheads,)
1483
- if dt_bias is not None:
1484
- assert dt_bias.shape == (nheads,)
1485
- ddt_bias = torch.empty_like(dt_bias, dtype=torch.float32)
1486
- else:
1487
- ddt_bias = None
1488
- if ddt is not None:
1489
- assert ddt.shape == dt.shape
1490
- else:
1491
- ddt = torch.empty_like(dt)
1492
- dA = torch.empty_like(A, dtype=torch.float32)
1493
- grid_chunk_cs = lambda META: (
1494
- batch,
1495
- nchunks,
1496
- triton.cdiv(nheads, META["BLOCK_SIZE_H"]),
1497
- )
1498
- with torch.cuda.device(dt.device.index):
1499
- _chunk_cumsum_bwd_kernel[grid_chunk_cs](
1500
- ddA,
1501
- ddt_out,
1502
- dt,
1503
- A,
1504
- dt_bias,
1505
- ddt,
1506
- dA,
1507
- ddt_bias,
1508
- batch,
1509
- seqlen,
1510
- nheads,
1511
- chunk_size,
1512
- dt_limit[0],
1513
- dt_limit[1],
1514
- ddA.stride(0),
1515
- ddA.stride(2),
1516
- ddA.stride(1),
1517
- ddA.stride(3),
1518
- ddt_out.stride(0),
1519
- ddt_out.stride(2),
1520
- ddt_out.stride(1),
1521
- ddt_out.stride(3),
1522
- dt.stride(0),
1523
- dt.stride(1),
1524
- dt.stride(2),
1525
- A.stride(0),
1526
- dt_bias.stride(0) if dt_bias is not None else 0,
1527
- ddt.stride(0),
1528
- ddt.stride(1),
1529
- ddt.stride(2),
1530
- dA.stride(0),
1531
- ddt_bias.stride(0) if ddt_bias is not None else 0,
1532
- dt_softplus,
1533
- HAS_DT_BIAS=dt_bias is not None,
1534
- BLOCK_SIZE_CHUNK=triton.next_power_of_2(chunk_size),
1535
- )
1536
- return ddt, dA, ddt_bias
1537
-
1538
-
1539
- def _chunk_state_fwd(
1540
- B, x, dt, dA_cumsum, seq_idx=None, states=None, states_in_fp32=True
1541
- ):
1542
- batch, seqlen, nheads, headdim = x.shape
1543
- _, _, nchunks, chunk_size = dt.shape
1544
- _, _, ngroups, dstate = B.shape
1545
- assert nheads % ngroups == 0
1546
- assert B.shape == (batch, seqlen, ngroups, dstate)
1547
- assert dt.shape == (batch, nheads, nchunks, chunk_size)
1548
- assert dA_cumsum.shape == dt.shape
1549
- if seq_idx is not None:
1550
- assert seq_idx.shape == (batch, seqlen)
1551
- if states is not None:
1552
- assert states.shape == (batch, nchunks, nheads, headdim, dstate)
1553
- else:
1554
- states_dtype = torch.float32 if states_in_fp32 else B.dtype
1555
- states = torch.empty(
1556
- (batch, nchunks, nheads, headdim, dstate),
1557
- device=x.device,
1558
- dtype=states_dtype,
1559
- )
1560
- grid = lambda META: (
1561
- triton.cdiv(headdim, META["BLOCK_SIZE_M"])
1562
- * triton.cdiv(dstate, META["BLOCK_SIZE_N"]),
1563
- batch * nchunks,
1564
- nheads,
1565
- )
1566
- with torch.cuda.device(x.device.index):
1567
- _chunk_state_fwd_kernel[grid](
1568
- x,
1569
- B,
1570
- states,
1571
- dt,
1572
- dA_cumsum,
1573
- seq_idx,
1574
- headdim,
1575
- dstate,
1576
- chunk_size,
1577
- batch,
1578
- seqlen,
1579
- nheads // ngroups,
1580
- x.stride(0),
1581
- x.stride(1),
1582
- x.stride(2),
1583
- x.stride(3),
1584
- B.stride(0),
1585
- B.stride(1),
1586
- B.stride(2),
1587
- B.stride(-1),
1588
- states.stride(0),
1589
- states.stride(1),
1590
- states.stride(2),
1591
- states.stride(3),
1592
- states.stride(4),
1593
- dt.stride(0),
1594
- dt.stride(2),
1595
- dt.stride(1),
1596
- dt.stride(3),
1597
- dA_cumsum.stride(0),
1598
- dA_cumsum.stride(2),
1599
- dA_cumsum.stride(1),
1600
- dA_cumsum.stride(3),
1601
- *(
1602
- (seq_idx.stride(0), seq_idx.stride(1))
1603
- if seq_idx is not None
1604
- else (0, 0)
1605
- ),
1606
- HAS_SEQ_IDX=seq_idx is not None,
1607
- )
1608
- return states
1609
-
1610
-
1611
- def _chunk_state_bwd_dx(B, x, dt, dA_cumsum, dstates, dx=None):
1612
- batch, seqlen, nheads, headdim = x.shape
1613
- _, _, nchunks, chunk_size = dt.shape
1614
- _, _, ngroups, dstate = B.shape
1615
- assert nheads % ngroups == 0
1616
- assert B.shape == (batch, seqlen, ngroups, dstate)
1617
- assert dt.shape == (batch, nheads, nchunks, chunk_size)
1618
- assert dA_cumsum.shape == dt.shape
1619
- assert dstates.shape == (batch, nchunks, nheads, headdim, dstate)
1620
- if dx is not None:
1621
- assert dx.shape == x.shape
1622
- else:
1623
- dx = torch.empty_like(x)
1624
- ddt = torch.empty(
1625
- batch, nheads, nchunks, chunk_size, device=dt.device, dtype=torch.float32
1626
- )
1627
- ddA_cumsum = torch.empty(
1628
- batch, nheads, nchunks, chunk_size, device=dA_cumsum.device, dtype=torch.float32
1629
- )
1630
- grid_dx = lambda META: (
1631
- triton.cdiv(chunk_size, META["BLOCK_SIZE_M"])
1632
- * triton.cdiv(headdim, META["BLOCK_SIZE_N"]),
1633
- batch * nchunks,
1634
- nheads,
1635
- )
1636
- with torch.cuda.device(x.device.index):
1637
- _chunk_state_bwd_dx_kernel[grid_dx](
1638
- x,
1639
- B,
1640
- dstates,
1641
- dt,
1642
- dA_cumsum,
1643
- dx,
1644
- ddt,
1645
- ddA_cumsum,
1646
- chunk_size,
1647
- headdim,
1648
- dstate,
1649
- batch,
1650
- seqlen,
1651
- nheads // ngroups,
1652
- x.stride(0),
1653
- x.stride(1),
1654
- x.stride(2),
1655
- x.stride(3),
1656
- B.stride(0),
1657
- B.stride(1),
1658
- B.stride(2),
1659
- B.stride(-1),
1660
- dstates.stride(0),
1661
- dstates.stride(1),
1662
- dstates.stride(2),
1663
- dstates.stride(3),
1664
- dstates.stride(4),
1665
- dt.stride(0),
1666
- dt.stride(2),
1667
- dt.stride(1),
1668
- dt.stride(3),
1669
- dA_cumsum.stride(0),
1670
- dA_cumsum.stride(2),
1671
- dA_cumsum.stride(1),
1672
- dA_cumsum.stride(3),
1673
- dx.stride(0),
1674
- dx.stride(1),
1675
- dx.stride(2),
1676
- dx.stride(3),
1677
- ddt.stride(0),
1678
- ddt.stride(2),
1679
- ddt.stride(1),
1680
- ddt.stride(3),
1681
- ddA_cumsum.stride(0),
1682
- ddA_cumsum.stride(2),
1683
- ddA_cumsum.stride(1),
1684
- ddA_cumsum.stride(3),
1685
- BLOCK_SIZE_DSTATE=max(triton.next_power_of_2(dstate), 16),
1686
- )
1687
- return dx, ddt.to(dt.dtype), ddA_cumsum.to(dA_cumsum.dtype)
1688
-
1689
-
1690
- def _chunk_state_bwd_db(x, dt, dA_cumsum, dstates, seq_idx=None, B=None, ngroups=1):
1691
- batch, seqlen, nheads, headdim = x.shape
1692
- _, _, nchunks, chunk_size = dt.shape
1693
- dstate = dstates.shape[-1]
1694
- assert dt.shape == (batch, nheads, nchunks, chunk_size)
1695
- assert dA_cumsum.shape == dt.shape
1696
- assert dstates.shape == (batch, nchunks, nheads, headdim, dstate)
1697
- if seq_idx is not None:
1698
- assert seq_idx.shape == (batch, seqlen)
1699
- if B is not None:
1700
- assert B.shape == (batch, seqlen, ngroups, dstate)
1701
- B_strides = (B.stride(0), B.stride(1), B.stride(2), B.stride(3))
1702
- # Use torch.empty since the Triton kernel will call init_to_zero
1703
- ddA_cumsum = torch.empty(
1704
- batch, nheads, nchunks, chunk_size, device=x.device, dtype=torch.float32
1705
- )
1706
- ddA_cumsum_strides = (
1707
- ddA_cumsum.stride(0),
1708
- ddA_cumsum.stride(2),
1709
- ddA_cumsum.stride(1),
1710
- ddA_cumsum.stride(3),
1711
- )
1712
- else:
1713
- B_strides = (0, 0, 0, 0)
1714
- ddA_cumsum = None
1715
- ddA_cumsum_strides = (0, 0, 0, 0)
1716
- nheads_ngroups_ratio = nheads // ngroups
1717
- sm_count = torch.cuda.get_device_properties(x.device).multi_processor_count
1718
- nheads_per_program = max(
1719
- min(math.ceil(batch * nchunks * nheads / sm_count), nheads_ngroups_ratio), 1
1720
- )
1721
- nsplits = triton.cdiv(nheads_ngroups_ratio, nheads_per_program)
1722
- dB = torch.empty(
1723
- batch, seqlen, nsplits, ngroups, dstate, device=x.device, dtype=torch.float32
1724
- )
1725
- grid_db = lambda META: (
1726
- triton.cdiv(chunk_size, META["BLOCK_SIZE_M"])
1727
- * triton.cdiv(dstate, META["BLOCK_SIZE_N"]),
1728
- batch * nchunks,
1729
- nsplits * ngroups,
1730
- )
1731
- with torch.cuda.device(x.device.index):
1732
- _chunk_state_bwd_db_kernel[grid_db](
1733
- x,
1734
- dstates,
1735
- B,
1736
- dt,
1737
- dA_cumsum,
1738
- seq_idx,
1739
- dB,
1740
- ddA_cumsum,
1741
- chunk_size,
1742
- dstate,
1743
- headdim,
1744
- batch,
1745
- seqlen,
1746
- nheads,
1747
- nheads_per_program,
1748
- ngroups,
1749
- x.stride(0),
1750
- x.stride(1),
1751
- x.stride(2),
1752
- x.stride(3),
1753
- dstates.stride(0),
1754
- dstates.stride(1),
1755
- dstates.stride(2),
1756
- dstates.stride(3),
1757
- dstates.stride(4),
1758
- *B_strides,
1759
- dt.stride(0),
1760
- dt.stride(2),
1761
- dt.stride(1),
1762
- dt.stride(3),
1763
- dA_cumsum.stride(0),
1764
- dA_cumsum.stride(2),
1765
- dA_cumsum.stride(1),
1766
- dA_cumsum.stride(3),
1767
- *(
1768
- (seq_idx.stride(0), seq_idx.stride(1))
1769
- if seq_idx is not None
1770
- else (0, 0)
1771
- ),
1772
- dB.stride(0),
1773
- dB.stride(1),
1774
- dB.stride(2),
1775
- dB.stride(3),
1776
- dB.stride(4),
1777
- *ddA_cumsum_strides,
1778
- HAS_DDA_CS=ddA_cumsum is not None,
1779
- HAS_SEQ_IDX=seq_idx is not None,
1780
- BLOCK_SIZE_K=max(triton.next_power_of_2(headdim), 16),
1781
- )
1782
- dB = dB.sum(2)
1783
- if ddA_cumsum is not None:
1784
- # The first element of ddA_cumsum is always zero, since that dA_cumsum does not contribute
1785
- # to the state of the chunk.
1786
- # torch.cumsum(ddA_cumsum[..., 1:], dim=-1, out=ddA_cumsum[..., 1:])
1787
- # But it's easier to just do the cumsum for all elements, the result will be the same.
1788
- torch.cumsum(ddA_cumsum, dim=-1, out=ddA_cumsum)
1789
- return dB if B is None else (dB, ddA_cumsum)
1790
-
1791
-
1792
- def _chunk_state_bwd_ddAcs_stable(B, x, dt, dA_cumsum, dstates, seq_idx=None):
1793
- batch, seqlen, nheads, headdim = x.shape
1794
- _, _, nchunks, chunk_size = dt.shape
1795
- _, _, ngroups, dstate = B.shape
1796
- assert nheads % ngroups == 0
1797
- assert B.shape == (batch, seqlen, ngroups, dstate)
1798
- assert dt.shape == (batch, nheads, nchunks, chunk_size)
1799
- assert dA_cumsum.shape == dt.shape
1800
- assert dstates.shape == (batch, nchunks, nheads, headdim, dstate)
1801
- if seq_idx is not None:
1802
- assert seq_idx.shape == (batch, seqlen)
1803
- # Use torch.empty since the Triton kernel will call init_to_zero
1804
- ddA_cumsum = torch.empty(
1805
- batch, nheads, nchunks, chunk_size, device=x.device, dtype=torch.float32
1806
- )
1807
- grid_ddtcs = lambda META: (
1808
- triton.cdiv(chunk_size, META["BLOCK_SIZE_M"])
1809
- * triton.cdiv(headdim, META["BLOCK_SIZE_N"]),
1810
- batch * nchunks,
1811
- nheads,
1812
- )
1813
- with torch.cuda.device(x.device.index):
1814
- _chunk_state_bwd_ddAcs_stable_kernel[grid_ddtcs](
1815
- x,
1816
- B,
1817
- dstates,
1818
- dt,
1819
- dA_cumsum,
1820
- seq_idx,
1821
- ddA_cumsum,
1822
- chunk_size,
1823
- headdim,
1824
- dstate,
1825
- batch,
1826
- seqlen,
1827
- nheads // ngroups,
1828
- x.stride(0),
1829
- x.stride(1),
1830
- x.stride(2),
1831
- x.stride(3),
1832
- B.stride(0),
1833
- B.stride(1),
1834
- B.stride(2),
1835
- B.stride(-1),
1836
- dstates.stride(0),
1837
- dstates.stride(1),
1838
- dstates.stride(2),
1839
- dstates.stride(3),
1840
- dstates.stride(4),
1841
- dt.stride(0),
1842
- dt.stride(2),
1843
- dt.stride(1),
1844
- dt.stride(3),
1845
- dA_cumsum.stride(0),
1846
- dA_cumsum.stride(2),
1847
- dA_cumsum.stride(1),
1848
- dA_cumsum.stride(3),
1849
- *(
1850
- (seq_idx.stride(0), seq_idx.stride(1))
1851
- if seq_idx is not None
1852
- else (0, 0)
1853
- ),
1854
- ddA_cumsum.stride(0),
1855
- ddA_cumsum.stride(2),
1856
- ddA_cumsum.stride(1),
1857
- ddA_cumsum.stride(3),
1858
- HAS_SEQ_IDX=seq_idx is not None,
1859
- BLOCK_SIZE_M=max(triton.next_power_of_2(chunk_size), 16),
1860
- BLOCK_SIZE_DSTATE=max(triton.next_power_of_2(dstate), 16),
1861
- )
1862
- torch.cumsum(ddA_cumsum[..., 1:], dim=-1, out=ddA_cumsum[..., 1:])
1863
- return ddA_cumsum
1864
-
1865
-
1866
- def chunk_state_varlen(B, x, dt, dA_cumsum, cu_seqlens, chunk_states):
1867
- total_seqlen, nheads, headdim = x.shape
1868
- _, nchunks, chunk_size = dt.shape
1869
- _, ngroups, dstate = B.shape
1870
- batch = cu_seqlens.shape[0] - 1
1871
- cu_seqlens = cu_seqlens.contiguous()
1872
- assert nheads % ngroups == 0
1873
- assert B.shape == (total_seqlen, ngroups, dstate)
1874
- assert dt.shape == (nheads, nchunks, chunk_size)
1875
- assert dA_cumsum.shape == dt.shape
1876
- assert chunk_states.shape == (nchunks, nheads, headdim, dstate)
1877
- states = torch.empty(
1878
- batch,
1879
- nheads,
1880
- headdim,
1881
- dstate,
1882
- dtype=chunk_states.dtype,
1883
- device=chunk_states.device,
1884
- )
1885
- grid = lambda META: (
1886
- triton.cdiv(headdim, META["BLOCK_SIZE_M"])
1887
- * triton.cdiv(dstate, META["BLOCK_SIZE_N"]),
1888
- batch,
1889
- nheads,
1890
- )
1891
- with torch.cuda.device(x.device.index):
1892
- _chunk_state_varlen_kernel[grid](
1893
- x,
1894
- B,
1895
- dt,
1896
- dA_cumsum,
1897
- chunk_states,
1898
- cu_seqlens,
1899
- states,
1900
- headdim,
1901
- dstate,
1902
- chunk_size,
1903
- total_seqlen,
1904
- nheads // ngroups,
1905
- x.stride(0),
1906
- x.stride(1),
1907
- x.stride(2),
1908
- B.stride(0),
1909
- B.stride(1),
1910
- B.stride(2),
1911
- dt.stride(1),
1912
- dt.stride(0),
1913
- dt.stride(2),
1914
- dA_cumsum.stride(1),
1915
- dA_cumsum.stride(0),
1916
- dA_cumsum.stride(2),
1917
- chunk_states.stride(0),
1918
- chunk_states.stride(1),
1919
- chunk_states.stride(2),
1920
- chunk_states.stride(3),
1921
- states.stride(0),
1922
- states.stride(1),
1923
- states.stride(2),
1924
- states.stride(3),
1925
- )
1926
- return states
1927
-
1928
-
1929
- class ChunkStateFn(torch.autograd.Function):
1930
-
1931
- @staticmethod
1932
- def forward(ctx, B, x, dt, dA_cumsum, states_in_fp32=True):
1933
- batch, seqlen, nheads, headdim = x.shape
1934
- _, _, nchunks, chunk_size = dt.shape
1935
- assert seqlen <= nchunks * chunk_size
1936
- _, _, ngroups, dstate = B.shape
1937
- assert B.shape == (batch, seqlen, ngroups, dstate)
1938
- assert dt.shape == (batch, nheads, nchunks, chunk_size)
1939
- assert dA_cumsum.shape == (batch, nheads, nchunks, chunk_size)
1940
- if B.stride(-1) != 1:
1941
- B = B.contiguous()
1942
- if (
1943
- x.stride(-1) != 1 and x.stride(1) != 1
1944
- ): # Either M or K dimension should be contiguous
1945
- x = x.contiguous()
1946
- states = _chunk_state_fwd(B, x, dt, dA_cumsum, states_in_fp32=states_in_fp32)
1947
- ctx.save_for_backward(B, x, dt, dA_cumsum)
1948
- return states
1949
-
1950
- @staticmethod
1951
- def backward(ctx, dstates):
1952
- B, x, dt, dA_cumsum = ctx.saved_tensors
1953
- batch, seqlen, nheads, headdim = x.shape
1954
- _, _, nchunks, chunk_size = dt.shape
1955
- _, _, ngroups, dstate = B.shape
1956
- assert dstates.shape == (batch, nchunks, nheads, headdim, dstate)
1957
- if dstates.stride(-1) != 1:
1958
- dstates = dstates.contiguous()
1959
- dx, ddt, ddA_cumsum = _chunk_state_bwd_dx(B, x, dt, dA_cumsum, dstates)
1960
- dB = _chunk_state_bwd_db(x, dt, dA_cumsum, dstates, ngroups=ngroups)
1961
- dB = dB.to(B.dtype)
1962
- return dB, dx, ddt, ddA_cumsum, None
1963
-
1964
-
1965
- def chunk_state(B, x, dt, dA_cumsum, states_in_fp32=True):
1966
- """
1967
- Argument:
1968
- B: (batch, seqlen, ngroups, dstate)
1969
- x: (batch, seqlen, nheads, headdim)
1970
- dt: (batch, nheads, nchunks, chunk_size)
1971
- dA_cumsum: (batch, nheads, nchunks, chunk_size)
1972
- Return:
1973
- states: (batch, nchunks, nheads, headdim, dstate)
1974
- """
1975
- return ChunkStateFn.apply(B, x, dt, dA_cumsum, states_in_fp32)
1976
-
1977
-
1978
- def chunk_state_ref(B, x, dt, dA_cumsum):
1979
- """
1980
- Argument:
1981
- B: (batch, seqlen, ngroups, dstate)
1982
- x: (batch, seqlen, nheads, headdim)
1983
- dt: (batch, nheads, nchunks, chunk_size)
1984
- dA_cumsum: (batch, nheads, nchunks, chunk_size)
1985
- Return:
1986
- states: (batch, nchunks, nheads, headdim, dstate)
1987
- """
1988
- # Check constraints.
1989
- batch, seqlen, nheads, headdim = x.shape
1990
- dstate = B.shape[-1]
1991
- _, _, nchunks, chunk_size = dt.shape
1992
- assert seqlen <= nchunks * chunk_size
1993
- assert x.shape == (batch, seqlen, nheads, headdim)
1994
- assert dt.shape == (batch, nheads, nchunks, chunk_size)
1995
- ngroups = B.shape[2]
1996
- assert nheads % ngroups == 0
1997
- assert B.shape == (batch, seqlen, ngroups, dstate)
1998
- B = repeat(B, "b l g d -> b l (g h) d", h=nheads // ngroups)
1999
- assert dA_cumsum.shape == (batch, nheads, nchunks, chunk_size)
2000
- if seqlen < nchunks * chunk_size:
2001
- x = F.pad(x, (0, 0, 0, 0, 0, nchunks * chunk_size - seqlen))
2002
- B = F.pad(B, (0, 0, 0, 0, 0, nchunks * chunk_size - seqlen))
2003
- x = rearrange(x, "b (c l) h p -> b c l h p", l=chunk_size)
2004
- B = rearrange(B, "b (c l) ... -> b c l ...", l=chunk_size)
2005
- decay_states = torch.exp((dA_cumsum[:, :, :, -1:] - dA_cumsum))
2006
- return torch.einsum(
2007
- "bclhn,bhcl,bhcl,bclhp->bchpn",
2008
- B.to(x.dtype),
2009
- decay_states.to(x.dtype),
2010
- dt.to(x.dtype),
2011
- x,
2012
- )
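A minimal usage sketch for the two functions above, assuming the import path from this build's directory layout (mamba_ssm.ops.triton.ssd_chunk_state); it cross-checks the Triton-backed chunk_state against the einsum-based chunk_state_ref on small random tensors shaped per the docstrings. It needs a CUDA device with triton and einops installed; the concrete sizes, seed, and the A = -0.5 decay used to build dA_cumsum are illustrative choices, not from the upstream code.

    import torch
    # Import path assumed from this build's layout; adjust if the package is installed differently.
    from mamba_ssm.ops.triton.ssd_chunk_state import chunk_state, chunk_state_ref

    torch.manual_seed(0)
    batch, seqlen, nheads, headdim = 1, 128, 4, 64
    ngroups, dstate, chunk_size = 1, 16, 64
    nchunks = (seqlen + chunk_size - 1) // chunk_size  # seqlen <= nchunks * chunk_size

    x = torch.randn(batch, seqlen, nheads, headdim, device="cuda")
    B = torch.randn(batch, seqlen, ngroups, dstate, device="cuda")
    dt = 0.1 * torch.rand(batch, nheads, nchunks, chunk_size, device="cuda")
    # dA_cumsum is a within-chunk cumulative sum of A * dt with A < 0 (here A = -0.5, purely illustrative).
    dA_cumsum = torch.cumsum(-0.5 * dt, dim=-1)

    states = chunk_state(B, x, dt, dA_cumsum)          # Triton kernel path: (batch, nchunks, nheads, headdim, dstate)
    states_ref = chunk_state_ref(B, x, dt, dA_cumsum)  # pure-PyTorch einsum reference
    # The difference should be small; the exact tolerance depends on dtype and TF32 settings.
    print((states - states_ref).abs().max().item())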
 
build/torch25-cxx11-cu118-x86_64-linux/mamba_ssm/ops/triton/ssd_combined.py DELETED
@@ -1,1884 +0,0 @@
1
- # Copyright (c) 2024, Tri Dao, Albert Gu.
2
-
3
- """We want triton==2.1.0 or 2.2.0 for this
4
- """
5
-
6
- from typing import Optional
7
-
8
- import math
9
- from packaging import version
10
-
11
- import torch
12
- import torch.nn.functional as F
13
- from torch import Tensor
14
- from ...utils.torch import custom_bwd, custom_fwd
15
-
16
- import triton
17
- import triton.language as tl
18
-
19
- from einops import rearrange, repeat
20
-
21
- try:
22
- from causal_conv1d import causal_conv1d_fn
23
- import causal_conv1d_cuda
24
- except ImportError:
25
- causal_conv1d_fn, causal_conv1d_cuda = None, None
26
-
27
- from .ssd_bmm import _bmm_chunk_fwd, _bmm_chunk_bwd
28
- from .ssd_chunk_state import _chunk_cumsum_fwd, _chunk_cumsum_bwd
29
- from .ssd_chunk_state import _chunk_state_fwd, _chunk_state_bwd_db
30
- from .ssd_chunk_state import _chunk_state_bwd_ddAcs_stable
31
- from .ssd_chunk_state import chunk_state, chunk_state_ref
32
- from .ssd_chunk_state import chunk_state_varlen
33
- from .ssd_state_passing import _state_passing_fwd, _state_passing_bwd
34
- from .ssd_state_passing import state_passing, state_passing_ref
35
- from .ssd_chunk_scan import _chunk_scan_fwd, _chunk_scan_bwd_dz, _chunk_scan_bwd_dstates
36
- from .ssd_chunk_scan import _chunk_scan_bwd_dC, _chunk_scan_bwd_dcb
37
- from .ssd_chunk_scan import _chunk_scan_bwd_ddAcs_stable
38
- from .ssd_chunk_scan import chunk_scan, chunk_scan_ref
39
- from .ssd_chunk_scan import _chunk_scan_bwd_ddAcs_prev
40
- from .layernorm_gated import rmsnorm_fn, _layer_norm_fwd, _layer_norm_bwd
41
- from .k_activations import _swiglu_fwd, _swiglu_bwd
42
-
43
- TRITON_22 = version.parse(triton.__version__) >= version.parse("2.2.0")
44
-
45
-
46
- def init_to_zero(names):
47
- return lambda nargs: [
48
- nargs[name].zero_() for name in names if nargs[name] is not None
49
- ]
50
-
51
-
52
- @triton.autotune(
53
- configs=[
54
- triton.Config(
55
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64},
56
- num_stages=3,
57
- num_warps=8,
58
- pre_hook=init_to_zero(["ddt_ptr"]),
59
- ),
60
- triton.Config(
61
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32},
62
- num_stages=4,
63
- num_warps=4,
64
- pre_hook=init_to_zero(["ddt_ptr"]),
65
- ),
66
- triton.Config(
67
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32},
68
- num_stages=4,
69
- num_warps=4,
70
- pre_hook=init_to_zero(["ddt_ptr"]),
71
- ),
72
- triton.Config(
73
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
74
- num_stages=4,
75
- num_warps=4,
76
- pre_hook=init_to_zero(["ddt_ptr"]),
77
- ),
78
- triton.Config(
79
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32},
80
- num_stages=4,
81
- num_warps=4,
82
- pre_hook=init_to_zero(["ddt_ptr"]),
83
- ),
84
- triton.Config(
85
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32},
86
- num_stages=4,
87
- num_warps=4,
88
- pre_hook=init_to_zero(["ddt_ptr"]),
89
- ),
90
- triton.Config(
91
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32},
92
- num_stages=5,
93
- num_warps=4,
94
- pre_hook=init_to_zero(["ddt_ptr"]),
95
- ),
96
- triton.Config(
97
- {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
98
- num_stages=5,
99
- num_warps=4,
100
- pre_hook=init_to_zero(["ddt_ptr"]),
101
- ),
102
- triton.Config(
103
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
104
- num_stages=4,
105
- num_warps=4,
106
- pre_hook=init_to_zero(["ddt_ptr"]),
107
- ),
108
- ],
109
- key=["chunk_size", "hdim", "dstate"],
110
- )
111
- @triton.jit
112
- def _chunk_scan_chunk_state_bwd_dx_kernel(
113
- # Pointers to matrices
114
- x_ptr,
115
- cb_ptr,
116
- dout_ptr,
117
- dt_ptr,
118
- dA_cumsum_ptr,
119
- seq_idx_ptr,
120
- D_ptr,
121
- b_ptr,
122
- dstates_ptr,
123
- dx_ptr,
124
- ddt_ptr,
125
- dD_ptr,
126
- # Matrix dimensions
127
- chunk_size,
128
- hdim,
129
- dstate,
130
- batch,
131
- seqlen,
132
- nheads_ngroups_ratio,
133
- # Strides
134
- stride_x_batch,
135
- stride_x_seqlen,
136
- stride_x_head,
137
- stride_x_hdim,
138
- stride_cb_batch,
139
- stride_cb_chunk,
140
- stride_cb_head,
141
- stride_cb_csize_m,
142
- stride_cb_csize_k,
143
- stride_dout_batch,
144
- stride_dout_seqlen,
145
- stride_dout_head,
146
- stride_dout_hdim,
147
- stride_dt_batch,
148
- stride_dt_chunk,
149
- stride_dt_head,
150
- stride_dt_csize,
151
- stride_dA_cs_batch,
152
- stride_dA_cs_chunk,
153
- stride_dA_cs_head,
154
- stride_dA_cs_csize,
155
- stride_seq_idx_batch,
156
- stride_seq_idx_seqlen,
157
- stride_D_head,
158
- stride_b_batch,
159
- stride_b_seqlen,
160
- stride_b_head,
161
- stride_b_dstate,
162
- stride_dstates_batch,
163
- stride_dstates_chunk,
164
- stride_dstates_head,
165
- stride_dstates_hdim,
166
- stride_dstates_dstate,
167
- stride_dx_batch,
168
- stride_dx_seqlen,
169
- stride_dx_head,
170
- stride_dx_hdim,
171
- stride_ddt_batch,
172
- stride_ddt_chunk,
173
- stride_ddt_head,
174
- stride_ddt_csize,
175
- stride_dD_batch,
176
- stride_dD_chunk,
177
- stride_dD_head,
178
- stride_dD_csize,
179
- stride_dD_hdim,
180
- # Meta-parameters
181
- HAS_D: tl.constexpr,
182
- D_HAS_HDIM: tl.constexpr,
183
- HAS_SEQ_IDX: tl.constexpr,
184
- BLOCK_SIZE_M: tl.constexpr,
185
- BLOCK_SIZE_N: tl.constexpr,
186
- BLOCK_SIZE_K: tl.constexpr,
187
- BLOCK_SIZE_DSTATE: tl.constexpr,
188
- IS_TRITON_22: tl.constexpr,
189
- ):
190
- pid_bc = tl.program_id(axis=1)
191
- pid_c = pid_bc // batch
192
- pid_b = pid_bc - pid_c * batch
193
- pid_h = tl.program_id(axis=2)
194
- num_pid_n = tl.cdiv(hdim, BLOCK_SIZE_N)
195
- pid_m = tl.program_id(axis=0) // num_pid_n
196
- pid_n = tl.program_id(axis=0) % num_pid_n
197
- x_ptr += (
198
- pid_b * stride_x_batch
199
- + pid_c * chunk_size * stride_x_seqlen
200
- + pid_h * stride_x_head
201
- )
202
- cb_ptr += (
203
- pid_b * stride_cb_batch
204
- + pid_c * stride_cb_chunk
205
- + (pid_h // nheads_ngroups_ratio) * stride_cb_head
206
- )
207
- dout_ptr += (
208
- pid_b * stride_dout_batch
209
- + pid_c * chunk_size * stride_dout_seqlen
210
- + pid_h * stride_dout_head
211
- )
212
- dt_ptr += pid_b * stride_dt_batch + pid_c * stride_dt_chunk + pid_h * stride_dt_head
213
- ddt_ptr += (
214
- pid_b * stride_ddt_batch + pid_c * stride_ddt_chunk + pid_h * stride_ddt_head
215
- )
216
- dA_cumsum_ptr += (
217
- pid_b * stride_dA_cs_batch
218
- + pid_c * stride_dA_cs_chunk
219
- + pid_h * stride_dA_cs_head
220
- )
221
- b_ptr += (
222
- pid_b * stride_b_batch
223
- + pid_c * chunk_size * stride_b_seqlen
224
- + (pid_h // nheads_ngroups_ratio) * stride_b_head
225
- )
226
- dstates_ptr += (
227
- pid_b * stride_dstates_batch
228
- + pid_c * stride_dstates_chunk
229
- + pid_h * stride_dstates_head
230
- )
231
- if HAS_SEQ_IDX:
232
- seq_idx_ptr += (
233
- pid_b * stride_seq_idx_batch + pid_c * chunk_size * stride_seq_idx_seqlen
234
- )
235
-
236
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
237
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
238
-
239
- chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
240
-
241
- acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
242
-
243
- dA_cs_m = tl.load(
244
- dA_cumsum_ptr + offs_m * stride_dA_cs_csize,
245
- mask=offs_m < chunk_size_limit,
246
- other=0.0,
247
- ).to(tl.float32)
248
-
249
- dA_cs_last = tl.load(dA_cumsum_ptr + (chunk_size - 1) * stride_dA_cs_csize).to(
250
- tl.float32
251
- )
252
- if not HAS_SEQ_IDX:
253
- scale = tl.exp(dA_cs_last - dA_cs_m)
254
- else:
255
- seq_idx_m = tl.load(
256
- seq_idx_ptr + offs_m * stride_seq_idx_seqlen,
257
- mask=offs_m < chunk_size_limit,
258
- other=-1,
259
- )
260
- seq_idx_last = tl.load(
261
- seq_idx_ptr + (chunk_size_limit - 1) * stride_seq_idx_seqlen
262
- )
263
- scale = tl.where(seq_idx_m == seq_idx_last, tl.exp(dA_cs_last - dA_cs_m), 0.0)
264
- # Might be faster to just do 1 iteration with larger BLOCK_SIZE_K, up to block size 128
265
- # However, we're getting error with the Triton compiler 2.1.0 for that code path:
266
- # Unexpected mma -> mma layout conversion
267
- # Triton 2.2.0 fixes this
268
- offs_dstate = tl.arange(
269
- 0,
270
- (
271
- BLOCK_SIZE_DSTATE
272
- if IS_TRITON_22 and BLOCK_SIZE_DSTATE <= 128
273
- else BLOCK_SIZE_K
274
- ),
275
- )
276
- b_ptrs = b_ptr + (
277
- offs_m[:, None] * stride_b_seqlen + offs_dstate[None, :] * stride_b_dstate
278
- )
279
- dstates_ptrs = dstates_ptr + (
280
- offs_n[None, :] * stride_dstates_hdim
281
- + offs_dstate[:, None] * stride_dstates_dstate
282
- )
283
- if IS_TRITON_22 and BLOCK_SIZE_DSTATE <= 128:
284
- b = tl.load(
285
- b_ptrs,
286
- mask=(offs_m[:, None] < chunk_size_limit) & (offs_dstate[None, :] < dstate),
287
- other=0.0,
288
- )
289
- dstates = tl.load(
290
- dstates_ptrs,
291
- mask=(offs_dstate[:, None] < dstate) & (offs_n[None, :] < hdim),
292
- other=0.0,
293
- )
294
- dstates = dstates.to(b_ptr.dtype.element_ty)
295
- acc = tl.dot(b, dstates) * scale[:, None]
296
- else:
297
- for k in range(0, dstate, BLOCK_SIZE_K):
298
- b = tl.load(
299
- b_ptrs,
300
- mask=(offs_m[:, None] < chunk_size_limit)
301
- & (offs_dstate[None, :] < dstate - k),
302
- other=0.0,
303
- )
304
- dstates = tl.load(
305
- dstates_ptrs,
306
- mask=(offs_dstate[:, None] < dstate - k) & (offs_n[None, :] < hdim),
307
- other=0.0,
308
- )
309
- dstates = dstates.to(b_ptr.dtype.element_ty)
310
- acc += tl.dot(b, dstates)
311
- b_ptrs += BLOCK_SIZE_K * stride_b_dstate
312
- dstates_ptrs += BLOCK_SIZE_K * stride_dstates_dstate
313
- acc *= scale[:, None]
314
-
315
- # x_ptrs = x_ptr + (offs_m[:, None] * stride_x_seqlen + offs_n[None, :] * stride_x_hdim)
316
- # x = tl.load(x_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim), other=0.0).to(tl.float32)
317
- # dt_ptrs = dt_ptr + offs_m * stride_dt_csize
318
- # dt_m = tl.load(dt_ptrs, mask=offs_m < chunk_size_limit, other=0.0).to(tl.float32)
319
- # ddt = tl.sum(acc * x, axis=1) * dt_m
320
- # ddt_ptrs = ddt_ptr + offs_m * stride_ddt_csize
321
- # tl.atomic_add(ddt_ptrs, ddt, mask=offs_m < chunk_size)
322
-
323
- offs_k = tl.arange(0, BLOCK_SIZE_K)
324
- cb_ptrs = cb_ptr + (
325
- offs_m[:, None] * stride_cb_csize_m + offs_k[None, :] * stride_cb_csize_k
326
- )
327
- dout_ptrs = dout_ptr + (
328
- offs_k[:, None] * stride_dout_seqlen + offs_n[None, :] * stride_dout_hdim
329
- )
330
- dA_cumsum_ptrs = dA_cumsum_ptr + offs_k * stride_dA_cs_csize
331
- K_MAX = chunk_size_limit
332
- K_MIN = pid_m * BLOCK_SIZE_M
333
- cb_ptrs += K_MIN * stride_cb_csize_k
334
- dout_ptrs += K_MIN * stride_dout_seqlen
335
- dA_cumsum_ptrs += K_MIN * stride_dA_cs_csize
336
- for k in range(K_MIN, K_MAX, BLOCK_SIZE_K):
337
- k = tl.multiple_of(k, BLOCK_SIZE_K)
338
- # For some reason setting mask to (offs_m[:, None] < chunk_size_limit) is much slower
339
- cb = tl.load(
340
- cb_ptrs,
341
- mask=(offs_m[:, None] < chunk_size) & (offs_k[None, :] < K_MAX - k),
342
- other=0.0,
343
- )
344
- dout = tl.load(
345
- dout_ptrs,
346
- mask=(offs_k[:, None] < K_MAX - k) & (offs_n[None, :] < hdim),
347
- other=0.0,
348
- )
349
- dA_cs_k = tl.load(dA_cumsum_ptrs, mask=offs_k < K_MAX - k, other=0.0).to(
350
- tl.float32
351
- )
352
- cb *= tl.exp(dA_cs_k[None, :] - dA_cs_m[:, None])
353
- # If we don't have the (k + offs_k[None, :] < K_MAX) mask, for indices outside this range,
354
- # we might have dA_cs_m = 0.0 and dA_cs_k very negative, and tl.exp will return inf.
355
- # Multiplying with cb, which is 0.0 outside the range, will make the result NaN.
356
- # This will cause NaN in acc, and hence NaN in dx and ddt.
357
- mask = (k + offs_k[None, :] >= offs_m[:, None]) & (k + offs_k[None, :] < K_MAX)
358
- cb = tl.where(mask, cb, 0.0)
359
- cb = cb.to(dout_ptr.dtype.element_ty)
360
- acc += tl.dot(cb, dout)
361
- cb_ptrs += BLOCK_SIZE_K * stride_cb_csize_k
362
- dout_ptrs += BLOCK_SIZE_K * stride_dout_seqlen
363
- dA_cumsum_ptrs += BLOCK_SIZE_K * stride_dA_cs_csize
364
-
365
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
366
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
367
- dt_ptrs = dt_ptr + offs_m * stride_dt_csize
368
- dt_m = tl.load(dt_ptrs, mask=offs_m < chunk_size_limit, other=0.0).to(tl.float32)
369
- dx = acc * dt_m[:, None]
370
- dx_ptr += (
371
- pid_b * stride_dx_batch
372
- + pid_c * chunk_size * stride_dx_seqlen
373
- + pid_h * stride_dx_head
374
- )
375
- dx_ptrs = dx_ptr + (
376
- offs_m[:, None] * stride_dx_seqlen + offs_n[None, :] * stride_dx_hdim
377
- )
378
- if HAS_D:
379
- dout_res_ptrs = dout_ptr + (
380
- offs_m[:, None] * stride_dout_seqlen + offs_n[None, :] * stride_dout_hdim
381
- )
382
- dout_res = tl.load(
383
- dout_res_ptrs,
384
- mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim),
385
- other=0.0,
386
- ).to(tl.float32)
387
- if D_HAS_HDIM:
388
- D = tl.load(
389
- D_ptr + pid_h * stride_D_head + offs_n, mask=offs_n < hdim, other=0.0
390
- ).to(tl.float32)
391
- else:
392
- D = tl.load(D_ptr + pid_h * stride_D_head).to(tl.float32)
393
- dx += dout_res * D
394
- tl.store(
395
- dx_ptrs,
396
- dx,
397
- mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim),
398
- )
399
-
400
- x_ptrs = x_ptr + (
401
- offs_m[:, None] * stride_x_seqlen + offs_n[None, :] * stride_x_hdim
402
- )
403
- x = tl.load(
404
- x_ptrs,
405
- mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim),
406
- other=0.0,
407
- ).to(tl.float32)
408
- if HAS_D:
409
- dD_ptr += (
410
- pid_b * stride_dD_batch
411
- + pid_c * stride_dD_chunk
412
- + pid_h * stride_dD_head
413
- + pid_m * stride_dD_csize
414
- )
415
- if D_HAS_HDIM:
416
- dD_ptrs = dD_ptr + offs_n * stride_dD_hdim
417
- dD = tl.sum(dout_res * x, axis=0)
418
- tl.store(dD_ptrs, dD, mask=offs_n < hdim)
419
- else:
420
- dD = tl.sum(dout_res * x)
421
- tl.store(dD_ptr, dD)
422
- ddt = tl.sum(acc * x, axis=1)
423
- ddt_ptrs = ddt_ptr + offs_m * stride_ddt_csize
424
- tl.atomic_add(ddt_ptrs, ddt, mask=offs_m < chunk_size)
425
-
426
-
427
- def _chunk_scan_chunk_state_bwd_dx(
428
- x, dt, dA_cumsum, B, CB, dout, dstates, D=None, seq_idx=None, dx=None
429
- ):
430
- batch, seqlen, nheads, headdim = x.shape
431
- _, _, nchunks, chunk_size = dt.shape
432
- _, _, ngroups, dstate = B.shape
433
- assert nheads % ngroups == 0
434
- assert B.shape == (batch, seqlen, ngroups, dstate)
435
- assert CB.shape == (batch, nchunks, ngroups, chunk_size, chunk_size)
436
- assert dt.shape == (batch, nheads, nchunks, chunk_size)
437
- assert dA_cumsum.shape == dt.shape
438
- assert dout.shape == x.shape
439
- assert dstates.shape == (batch, nchunks, nheads, headdim, dstate)
440
- if seq_idx is not None:
441
- assert seq_idx.shape == (batch, seqlen)
442
- if D is not None:
443
- assert D.shape == (nheads, headdim) or D.shape == (nheads,)
444
- assert D.stride(-1) == 1
445
- BLOCK_SIZE_min = 32
446
- dD = torch.empty(
447
- triton.cdiv(chunk_size, BLOCK_SIZE_min),
448
- batch,
449
- nchunks,
450
- nheads,
451
- headdim if D.dim() == 2 else 1,
452
- device=D.device,
453
- dtype=torch.float32,
454
- )
455
- else:
456
- dD = None
457
- dD_strides = (
458
- (dD.stride(0), dD.stride(1), dD.stride(2), dD.stride(3), dD.stride(4))
459
- if D is not None
460
- else (0, 0, 0, 0, 0)
461
- )
462
- if dx is None:
463
- dx = torch.empty_like(x)
464
- else:
465
- assert dx.shape == x.shape
466
- ddt = torch.empty(
467
- batch, nheads, nchunks, chunk_size, device=dout.device, dtype=torch.float32
468
- )
469
- grid_dx = lambda META: (
470
- triton.cdiv(chunk_size, META["BLOCK_SIZE_M"])
471
- * triton.cdiv(headdim, META["BLOCK_SIZE_N"]),
472
- batch * nchunks,
473
- nheads,
474
- )
475
- with torch.cuda.device(x.device.index):
476
- _chunk_scan_chunk_state_bwd_dx_kernel[grid_dx](
477
- x,
478
- CB,
479
- dout,
480
- dt,
481
- dA_cumsum,
482
- seq_idx,
483
- D,
484
- B,
485
- dstates,
486
- dx,
487
- ddt,
488
- dD,
489
- chunk_size,
490
- headdim,
491
- dstate,
492
- batch,
493
- seqlen,
494
- nheads // ngroups,
495
- x.stride(0),
496
- x.stride(1),
497
- x.stride(2),
498
- x.stride(3),
499
- CB.stride(0),
500
- CB.stride(1),
501
- CB.stride(2),
502
- CB.stride(-1),
503
- CB.stride(-2),
504
- dout.stride(0),
505
- dout.stride(1),
506
- dout.stride(2),
507
- dout.stride(3),
508
- dt.stride(0),
509
- dt.stride(2),
510
- dt.stride(1),
511
- dt.stride(3),
512
- dA_cumsum.stride(0),
513
- dA_cumsum.stride(2),
514
- dA_cumsum.stride(1),
515
- dA_cumsum.stride(3),
516
- *(
517
- (seq_idx.stride(0), seq_idx.stride(1))
518
- if seq_idx is not None
519
- else (0, 0)
520
- ),
521
- D.stride(0) if D is not None else 0,
522
- B.stride(0),
523
- B.stride(1),
524
- B.stride(2),
525
- B.stride(3),
526
- dstates.stride(0),
527
- dstates.stride(1),
528
- dstates.stride(2),
529
- dstates.stride(3),
530
- dstates.stride(4),
531
- dx.stride(0),
532
- dx.stride(1),
533
- dx.stride(2),
534
- dx.stride(3),
535
- ddt.stride(0),
536
- ddt.stride(2),
537
- ddt.stride(1),
538
- ddt.stride(3),
539
- dD_strides[1],
540
- dD_strides[2],
541
- dD_strides[3],
542
- dD_strides[0],
543
- dD_strides[4],
544
- D is not None,
545
- D.dim() == 2 if D is not None else True,
546
- HAS_SEQ_IDX=seq_idx is not None,
547
- BLOCK_SIZE_DSTATE=max(triton.next_power_of_2(dstate), 16),
548
- IS_TRITON_22=TRITON_22
549
- )
550
- if D is not None:
551
- BLOCK_SIZE_actual = _chunk_scan_chunk_state_bwd_dx_kernel.best_config.kwargs[
552
- "BLOCK_SIZE_M"
553
- ]
554
- n_valid_blocks = (chunk_size + BLOCK_SIZE_actual - 1) // BLOCK_SIZE_actual
555
- dD = dD[:n_valid_blocks].sum(dim=(0, 1, 2)).to(dtype=D.dtype)
556
- if D.dim() == 1:
557
- dD = rearrange(dD, "h 1 -> h")
558
- return dx, ddt.to(dtype=dt.dtype), dD
559
-
560
-
561
- def _mamba_chunk_scan_combined_fwd(
562
- x,
563
- dt,
564
- A,
565
- B,
566
- C,
567
- chunk_size,
568
- D=None,
569
- z=None,
570
- dt_bias=None,
571
- initial_states=None,
572
- seq_idx=None,
573
- cu_seqlens=None,
574
- dt_softplus=False,
575
- dt_limit=(0.0, float("inf")),
576
- ):
577
- batch, seqlen, nheads, headdim = x.shape
578
- _, _, ngroups, dstate = B.shape
579
- assert nheads % ngroups == 0
580
- assert B.shape == (batch, seqlen, ngroups, dstate)
581
- assert x.shape == (batch, seqlen, nheads, headdim)
582
- assert dt.shape == (batch, seqlen, nheads)
583
- assert A.shape == (nheads,)
584
- assert C.shape == B.shape
585
- if z is not None:
586
- assert z.shape == x.shape
587
- if D is not None:
588
- assert D.shape == (nheads, headdim) or D.shape == (nheads,)
589
- if seq_idx is not None:
590
- assert seq_idx.shape == (batch, seqlen)
591
- if B.stride(-1) != 1:
592
- B = B.contiguous()
593
- if C.stride(-1) != 1:
594
- C = C.contiguous()
595
- if (
596
- x.stride(-1) != 1 and x.stride(1) != 1
597
- ): # Either M or K dimension should be contiguous
598
- x = x.contiguous()
599
- if (
600
- z is not None and z.stride(-1) != 1 and z.stride(1) != 1
601
- ): # Either M or K dimension should be contiguous
602
- z = z.contiguous()
603
- if D is not None and D.stride(-1) != 1:
604
- D = D.contiguous()
605
- if initial_states is not None:
606
- assert initial_states.shape == (batch, nheads, headdim, dstate)
607
- # # (batch, nchunks, chunk_size, chunk_size) or (batch, nchunks, nheads, chunk_size, chunk_size)
608
- # dA_cumsum_tmp0, dt_tmp0 = _chunk_cumsum_fwd(dt[:, :147], A, chunk_size, dt_bias=dt_bias, dt_softplus=dt_softplus)
609
- # dA_cumsum_tmp1, dt_tmp1 = _chunk_cumsum_fwd(dt[:, 147:], A, chunk_size, dt_bias=dt_bias, dt_softplus=dt_softplus)
610
- # dA_cumsum_tmp2, dt_tmp2 = _chunk_cumsum_fwd(dt[:, 147:256], A, chunk_size, dt_bias=dt_bias, dt_softplus=dt_softplus)
611
- dA_cumsum, dt = _chunk_cumsum_fwd(
612
- dt, A, chunk_size, dt_bias=dt_bias, dt_softplus=dt_softplus, dt_limit=dt_limit
613
- )
614
- states = _chunk_state_fwd(B, x, dt, dA_cumsum, seq_idx=seq_idx, states_in_fp32=True)
615
- # states_tmp0 = _chunk_state_fwd(B[:, :147], x[:, :147], dt_tmp0, dA_cumsum_tmp0, states_in_fp32=True)
616
- # states_tmp1 = _chunk_state_fwd(B[:, 147:], x[:, 147:], dt_tmp1, dA_cumsum_tmp1, states_in_fp32=True)
617
- # states_tmp2 = _chunk_state_fwd(B[:, 147:256], x[:, 147:256], dt_tmp2, dA_cumsum_tmp2, states_in_fp32=True)
618
- states, final_states = _state_passing_fwd(
619
- rearrange(states, "... p n -> ... (p n)"),
620
- dA_cumsum[:, :, :, -1],
621
- initial_states=(
622
- rearrange(initial_states, "... p n -> ... (p n)")
623
- if initial_states is not None
624
- else None
625
- ),
626
- seq_idx=seq_idx,
627
- chunk_size=chunk_size,
628
- out_dtype=C.dtype,
629
- )
630
- states, final_states = [
631
- rearrange(t, "... (p n) -> ... p n", n=dstate) for t in [states, final_states]
632
- ]
633
- # states_tmp0 = rearrange(_state_passing_fwd(rearrange(states_tmp0, "... p n -> ... (p n)"), dA_cumsum_tmp0[:, :, :, -1], chunk_size=chunk_size), "... (p n) -> ... p n", n=dstate)
634
- # states_tmp1 = rearrange(_state_passing_fwd(rearrange(states_tmp1, "... p n -> ... (p n)"), dA_cumsum_tmp1[:, :, :, -1], chunk_size=chunk_size), "... (p n) -> ... p n", n=dstate)
635
- CB = _bmm_chunk_fwd(C, B, chunk_size, seq_idx=seq_idx, output_dtype=torch.float32)
636
- out, out_x = _chunk_scan_fwd(
637
- CB, x, dt, dA_cumsum, C, states, D=D, z=z, seq_idx=seq_idx
638
- )
639
- if cu_seqlens is None:
640
- return out, out_x, dt, dA_cumsum, states, final_states
641
- else:
642
- assert (
643
- batch == 1
644
- ), "passing cu_seqlens to get the varlen states is only supported if batch dimension is 1"
645
- varlen_states = chunk_state_varlen(
646
- B.squeeze(0),
647
- x.squeeze(0),
648
- dt.squeeze(0),
649
- dA_cumsum.squeeze(0),
650
- cu_seqlens,
651
- states.squeeze(0),
652
- )
653
- return out, out_x, dt, dA_cumsum, states, final_states, varlen_states
654
-
655
-
656
- def _mamba_chunk_scan_combined_bwd(
657
- dout,
658
- x,
659
- dt,
660
- A,
661
- B,
662
- C,
663
- out,
664
- chunk_size,
665
- D=None,
666
- z=None,
667
- dt_bias=None,
668
- initial_states=None,
669
- dfinal_states=None,
670
- seq_idx=None,
671
- dt_softplus=False,
672
- dt_limit=(0.0, float("inf")),
673
- dx=None,
674
- ddt=None,
675
- dB=None,
676
- dC=None,
677
- dz=None,
678
- recompute_output=False,
679
- ):
680
- if dout.stride(-1) != 1:
681
- dout = dout.contiguous()
682
- batch, seqlen, nheads, headdim = x.shape
683
- nchunks = math.ceil(seqlen / chunk_size)
684
- _, _, ngroups, dstate = B.shape
685
- assert dout.shape == (batch, seqlen, nheads, headdim)
686
- assert dt.shape == (batch, seqlen, nheads)
687
- assert A.shape == (nheads,)
688
- assert nheads % ngroups == 0
689
- assert B.shape == (batch, seqlen, ngroups, dstate)
690
- assert C.shape == B.shape
691
- assert out.shape == x.shape
692
- if initial_states is not None:
693
- assert initial_states.shape == (batch, nheads, headdim, dstate)
694
- if seq_idx is not None:
695
- assert seq_idx.shape == (batch, seqlen)
696
- if dx is not None:
697
- assert dx.shape == x.shape
698
- if dB is not None:
699
- assert dB.shape == B.shape
700
- dB_given = dB
701
- else:
702
- dB_given = torch.empty_like(B)
703
- if dC is not None:
704
- assert dC.shape == C.shape
705
- dC_given = dC
706
- else:
707
- dC_given = torch.empty_like(C)
708
- if dz is not None:
709
- assert z is not None
710
- assert dz.shape == z.shape
711
- if ddt is not None:
712
- assert ddt.shape == dt.shape
713
- ddt_given = ddt
714
- else:
715
- ddt_given = torch.empty_like(dt)
716
- # TD: For some reason Triton (2.1.0 and 2.2.0) errors with
717
- # "[CUDA]: invalid device context" (e.g. during varlne test), and cloning makes it work. Idk why.
718
- dt_in = dt.clone()
719
- dA_cumsum, dt = _chunk_cumsum_fwd(
720
- dt_in,
721
- A,
722
- chunk_size,
723
- dt_bias=dt_bias,
724
- dt_softplus=dt_softplus,
725
- dt_limit=dt_limit,
726
- )
727
- CB = _bmm_chunk_fwd(C, B, chunk_size, seq_idx=seq_idx, output_dtype=torch.float32)
728
- states = _chunk_state_fwd(B, x, dt, dA_cumsum, seq_idx=seq_idx, states_in_fp32=True)
729
- states, _ = _state_passing_fwd(
730
- rearrange(states, "... p n -> ... (p n)"),
731
- dA_cumsum[:, :, :, -1],
732
- initial_states=(
733
- rearrange(initial_states, "... p n -> ... (p n)")
734
- if initial_states is not None
735
- else None
736
- ),
737
- seq_idx=seq_idx,
738
- chunk_size=chunk_size,
739
- )
740
- states = rearrange(states, "... (p n) -> ... p n", n=dstate)
741
- if z is not None:
742
- dz, dout, dD, *rest = _chunk_scan_bwd_dz(
743
- x,
744
- z,
745
- out,
746
- dout,
747
- chunk_size=chunk_size,
748
- has_ddAcs=False,
749
- D=D,
750
- dz=dz,
751
- recompute_output=recompute_output,
752
- )
753
- outz = rest[0] if recompute_output else out
754
- else:
755
- dz = None
756
- outz = out
757
- dstates = _chunk_scan_bwd_dstates(
758
- C, dA_cumsum, dout, seq_idx=seq_idx, dtype=states.dtype
759
- )
760
- # dstates has length nchunks, containing the gradient to initial states at index 0 and
761
- # gradient to the states of chunk (nchunks - 2) at index (nchunks - 1)
762
- # Do computation in fp32 but convert dstates and states to fp16/bf16 since dstates and states
763
- # will be used in matmul in the next kernels.
764
- dstates, ddA_chunk_cumsum, dinitial_states, states = _state_passing_bwd(
765
- rearrange(states, "... p n -> ... (p n)"),
766
- dA_cumsum[:, :, :, -1],
767
- rearrange(dstates, "... p n -> ... (p n)"),
768
- dfinal_states=(
769
- rearrange(dfinal_states, "... p n -> ... (p n)")
770
- if dfinal_states is not None
771
- else None
772
- ),
773
- seq_idx=seq_idx,
774
- has_initial_states=initial_states is not None,
775
- dstates_dtype=x.dtype,
776
- states_dtype=x.dtype,
777
- chunk_size=chunk_size,
778
- )
779
- # dstates has length nchunks, containing the gradient to states of chunk 0 at index 0 and
780
- # gradient to the final states at index (nchunks - 1)
781
- # states has length nchunks, containing the initial states at index 0 and the state for chunk (nchunks - 2) at index (nchunks - 1)
782
- # The final states is not stored.
783
- states = rearrange(states, "... (p n) -> ... p n", n=dstate)
784
- dstates = rearrange(dstates, "... (p n) -> ... p n", n=dstate)
785
- dinitial_states = (
786
- rearrange(dinitial_states, "... (p n) -> ... p n", n=dstate)
787
- if dinitial_states is not None
788
- else None
789
- )
790
- dx, ddt, dD_from_x = _chunk_scan_chunk_state_bwd_dx(
791
- x, dt, dA_cumsum, B, CB, dout, dstates, D=D, seq_idx=seq_idx, dx=dx
792
- )
793
- # dB = _chunk_state_bwd_db(x, dt, dA_cumsum, dstates, seq_idx=seq_idx, ngroups=ngroups)
794
- dB, ddA_next = _chunk_state_bwd_db(
795
- x, dt, dA_cumsum, dstates, seq_idx=seq_idx, B=B, ngroups=ngroups
796
- )
797
- # dC = _chunk_scan_bwd_dC(states[:, :-1].to(x.dtype), dA_cumsum, dout, seq_idx=seq_idx, ngroups=ngroups)
798
- dC, ddA_cumsum_prev = _chunk_scan_bwd_dC(
799
- states.to(x.dtype), dA_cumsum, dout, seq_idx=seq_idx, C=C, ngroups=ngroups
800
- )
801
- # Computing ddA with the dcb kernel is much slower, so we're not using it for now
802
- dCB = _chunk_scan_bwd_dcb(x, dt, dA_cumsum, dout, seq_idx=seq_idx, ngroups=ngroups)
803
- # dCB, ddA_tmp = _chunk_scan_bwd_dcb(x, dt, dA_cumsum, dout, seq_idx=seq_idx, CB=CB, ngroups=ngroups)
804
- dCB = dCB.to(CB.dtype)
805
- _bmm_chunk_bwd(C, dCB, residual=dB, out=dB_given)
806
- _bmm_chunk_bwd(B, rearrange(dCB, "... l s -> ... s l"), residual=dC, out=dC_given)
807
- # If we have z, then dout_x is recomputed in fp32 so dD = (dout_x * x).sum() is more accurate
808
- # than dD_from_x = (dout_x * x).sum() where dout_x is in fp16/bf16
809
- if z is None:
810
- dD = dD_from_x
811
- # Formula for ddA_cumsum, assuming out is the output of the forward pass before adding x * D.
812
- # ddA_cumsum = torch.einsum("bclhp,bclhp->bhcl", out.float(), dout.float()) - ddt * dt
813
- # However, this is numerically unstable: when we do the reverse cumsum on ddA_cumsum, there might
814
- # be a lot of underflow.
815
-
816
- # This is already done as part of bwd_dC kernel
817
- # ddA_cumsum_prev = _chunk_scan_bwd_ddAcs_prev(states[:, :-1], C, dout, dA_cumsum, seq_idx=seq_idx)
818
- ddA_cumsum_prev[..., -1] += ddA_chunk_cumsum
819
- ddA_prev = ddA_cumsum_prev.flip([-1]).cumsum(dim=-1).flip([-1])
820
- # This is already done as part of bwd_dB kernel
821
- # ddA_next = _chunk_state_bwd_ddAcs_stable(B, x, dt, dA_cumsum, dstates, seq_idx=seq_idx)
822
- # We don't need to pass in seq_idx because CB also zeros out entries where seq_idx[i] != seq_idx[j]
823
- ddA = _chunk_scan_bwd_ddAcs_stable(x, dt, dA_cumsum, dout, CB)
824
- ddA += ddA_next + ddA_prev
825
-
826
- ddt_given, dA, ddt_bias = _chunk_cumsum_bwd(
827
- ddA,
828
- ddt,
829
- dt_in,
830
- A,
831
- dt_bias=dt_bias,
832
- dt_softplus=dt_softplus,
833
- dt_limit=dt_limit,
834
- ddt=ddt_given,
835
- )
836
-
837
- # These 2 lines are just to test ddt and dA being computed by old code
838
- # _, dA = selective_scan_bwd(dout, x, dt, A, B, C, D=D.float(), z=z)
839
- # ddt_given.copy_(ddt)
840
-
841
- return_vals = (
842
- dx,
843
- ddt_given,
844
- dA,
845
- dB_given,
846
- dC_given,
847
- dD,
848
- dz,
849
- ddt_bias,
850
- dinitial_states,
851
- )
852
- return return_vals if not recompute_output else (*return_vals, outz)
853
-
854
-
855
- def selective_scan_bwd(dout, x, dt, A, B, C, D=None, z=None):
856
- """
857
- Argument:
858
- dout: (batch, seqlen, nheads, headdim)
859
- x: (batch, seqlen, nheads, headdim)
860
- dt: (batch, nheads, nchunks, chunk_size) or (batch, nheads, headdim, nchunks, chunk_size)
861
- A: (nheads) or (dim, dstate)
862
- B: (batch, seqlen, ngroups, dstate)
863
- C: (batch, seqlen, ngroups, dstate)
864
- D: (nheads, headdim) or (nheads,)
865
- z: (batch, seqlen, nheads, headdim)
866
- Return:
867
- out: (batch, seqlen, nheads, headdim)
868
- """
869
- import selective_scan
870
-
871
- batch, seqlen, nheads, headdim = x.shape
872
- chunk_size = dt.shape[-1]
873
- _, _, ngroups, dstate = B.shape
874
- assert nheads % ngroups == 0
875
- x = rearrange(x, "b l h p -> b (h p) l")
876
- squeeze_dt = dt.dim() == 4
877
- if dt.dim() == 4:
878
- dt = repeat(dt, "b h c l -> b h p c l", p=headdim)
879
- dt = rearrange(dt, "b h p c l -> b (h p) (c l)", p=headdim)
880
- squeeze_A = A.dim() == 1
881
- if A.dim() == 1:
882
- A = repeat(A, "h -> (h p) n", p=headdim, n=dstate).to(dtype=torch.float32)
883
- else:
884
- A = A.to(dtype=torch.float32)
885
- B = rearrange(B, "b l g n -> b g n l")
886
- C = rearrange(C, "b l g n -> b g n l")
887
- if D is not None:
888
- if D.dim() == 2:
889
- D = rearrange(D, "h p -> (h p)")
890
- else:
891
- D = repeat(D, "h -> (h p)", p=headdim)
892
- if z is not None:
893
- z = rearrange(z, "b l h p -> b (h p) l")
894
-
895
- if x.stride(-1) != 1:
896
- x = x.contiguous()
897
- if dt.stride(-1) != 1:
898
- dt = dt.contiguous()
899
- if D is not None:
900
- D = D.contiguous()
901
- if B.stride(-1) != 1:
902
- B = B.contiguous()
903
- if C.stride(-1) != 1:
904
- C = C.contiguous()
905
- if z is not None and z.stride(-1) != 1:
906
- z = z.contiguous()
907
- _, intermediate, *rest = selective_scan.fwd(
908
- x, dt.to(dtype=x.dtype), A, B, C, D, z, None, False
909
- )
910
- if z is not None:
911
- out = rest[0]
912
- else:
913
- out = None
914
-
915
- dout = rearrange(dout, "b l h p -> b (h p) l")
916
-
917
- if dout.stride(-1) != 1:
918
- dout = dout.contiguous()
919
- # The kernel supports passing in a pre-allocated dz (e.g., in case we want to fuse the
920
- # backward of selective_scan with the backward of chunk).
921
- # Here we just pass in None and dz will be allocated in the C++ code.
922
- _, ddt, dA, *rest = selective_scan.bwd(
923
- x,
924
- dt.to(dtype=x.dtype),
925
- A,
926
- B,
927
- C,
928
- D,
929
- z,
930
- None,
931
- dout,
932
- intermediate,
933
- out,
934
- None,
935
- False,
936
- False, # option to recompute out_z, not used here
937
- )
938
- ddt = rearrange(ddt, "b (h p) (c l) -> b h p c l", p=headdim, l=chunk_size)
939
- if squeeze_dt:
940
- ddt = ddt.float().sum(dim=2)
941
- if squeeze_A:
942
- dA = rearrange(dA, "(h p) n -> h p n", p=headdim).sum(dim=(1, 2))
943
- return ddt, dA
944
-
945
-
946
- class MambaChunkScanCombinedFn(torch.autograd.Function):
947
-
948
- @staticmethod
949
- def forward(
950
- ctx,
951
- x,
952
- dt,
953
- A,
954
- B,
955
- C,
956
- chunk_size,
957
- D=None,
958
- z=None,
959
- dt_bias=None,
960
- initial_states=None,
961
- seq_idx=None,
962
- cu_seqlens=None,
963
- dt_softplus=False,
964
- dt_limit=(0.0, float("inf")),
965
- return_final_states=False,
966
- return_varlen_states=False,
967
- ):
968
- ctx.dt_dtype = dt.dtype
969
- if not return_varlen_states:
970
- cu_seqlens = None
971
- else:
972
- assert (
973
- cu_seqlens is not None
974
- ), "cu_seqlens must be provided if return_varlen_states is True"
975
- out, out_x, dt_out, dA_cumsum, states, final_states, *rest = (
976
- _mamba_chunk_scan_combined_fwd(
977
- x,
978
- dt,
979
- A,
980
- B,
981
- C,
982
- chunk_size,
983
- D=D,
984
- z=z,
985
- dt_bias=dt_bias,
986
- initial_states=initial_states,
987
- seq_idx=seq_idx,
988
- cu_seqlens=cu_seqlens,
989
- dt_softplus=dt_softplus,
990
- dt_limit=dt_limit,
991
- )
992
- )
993
- ctx.save_for_backward(
994
- out if z is None else out_x,
995
- x,
996
- dt,
997
- dA_cumsum,
998
- A,
999
- B,
1000
- C,
1001
- D,
1002
- z,
1003
- dt_bias,
1004
- initial_states,
1005
- seq_idx,
1006
- )
1007
- ctx.dt_softplus = dt_softplus
1008
- ctx.chunk_size = chunk_size
1009
- ctx.dt_limit = dt_limit
1010
- ctx.return_final_states = return_final_states
1011
- ctx.return_varlen_states = return_varlen_states
1012
- if not return_varlen_states:
1013
- return out if not return_final_states else (out, final_states)
1014
- else:
1015
- varlen_states = rest[0]
1016
- return (
1017
- (out, varlen_states)
1018
- if not return_final_states
1019
- else (out, final_states, varlen_states)
1020
- )
1021
-
1022
- @staticmethod
1023
- def backward(ctx, dout, *args):
1024
- out, x, dt, dA_cumsum, A, B, C, D, z, dt_bias, initial_states, seq_idx = (
1025
- ctx.saved_tensors
1026
- )
1027
- assert (
1028
- not ctx.return_varlen_states
1029
- ), "return_varlen_states is not supported in backward"
1030
- dfinal_states = args[0] if ctx.return_final_states else None
1031
- dx, ddt, dA, dB, dC, dD, dz, ddt_bias, dinitial_states = (
1032
- _mamba_chunk_scan_combined_bwd(
1033
- dout,
1034
- x,
1035
- dt,
1036
- A,
1037
- B,
1038
- C,
1039
- out,
1040
- ctx.chunk_size,
1041
- D=D,
1042
- z=z,
1043
- dt_bias=dt_bias,
1044
- initial_states=initial_states,
1045
- dfinal_states=dfinal_states,
1046
- seq_idx=seq_idx,
1047
- dt_softplus=ctx.dt_softplus,
1048
- dt_limit=ctx.dt_limit,
1049
- )
1050
- )
1051
- return (
1052
- dx,
1053
- ddt,
1054
- dA,
1055
- dB,
1056
- dC,
1057
- None,
1058
- dD,
1059
- dz,
1060
- ddt_bias,
1061
- dinitial_states,
1062
- None,
1063
- None,
1064
- None,
1065
- None,
1066
- None,
1067
- None,
1068
- )
1069
-
1070
-
1071
- def mamba_chunk_scan_combined(
1072
- x,
1073
- dt,
1074
- A,
1075
- B,
1076
- C,
1077
- chunk_size,
1078
- D=None,
1079
- z=None,
1080
- dt_bias=None,
1081
- initial_states=None,
1082
- seq_idx=None,
1083
- cu_seqlens=None,
1084
- dt_softplus=False,
1085
- dt_limit=(0.0, float("inf")),
1086
- return_final_states=False,
1087
- return_varlen_states=False,
1088
- ):
1089
- """
1090
- Argument:
1091
- x: (batch, seqlen, nheads, headdim)
1092
- dt: (batch, seqlen, nheads)
1093
- A: (nheads)
1094
- B: (batch, seqlen, ngroups, dstate)
1095
- C: (batch, seqlen, ngroups, dstate)
1096
- chunk_size: int
1097
- D: (nheads, headdim) or (nheads,)
1098
- z: (batch, seqlen, nheads, headdim)
1099
- dt_bias: (nheads,)
1100
- initial_states: (batch, nheads, headdim, dstate)
1101
- seq_idx: (batch, seqlen)
1102
- cu_seqlens: (num_sequences + 1) or None, only used if return_varlen_states is True
1103
- dt_softplus: Whether to apply softplus to dt
1104
- Return:
1105
- out: (batch, seqlen, nheads, headdim)
1106
- """
1107
- return MambaChunkScanCombinedFn.apply(
1108
- x,
1109
- dt,
1110
- A,
1111
- B,
1112
- C,
1113
- chunk_size,
1114
- D,
1115
- z,
1116
- dt_bias,
1117
- initial_states,
1118
- seq_idx,
1119
- cu_seqlens,
1120
- dt_softplus,
1121
- dt_limit,
1122
- return_final_states,
1123
- return_varlen_states,
1124
- )
1125
-
1126
-
1127
- def mamba_chunk_scan(
1128
- x, dt, A, B, C, chunk_size, D=None, z=None, dt_bias=None, dt_softplus=False
1129
- ):
1130
- """
1131
- Argument:
1132
- x: (batch, seqlen, nheads, headdim)
1133
- dt: (batch, seqlen, nheads)
1134
- A: (nheads)
1135
- B: (batch, seqlen, ngroups, dstate)
1136
- C: (batch, seqlen, ngroups, dstate)
1137
- D: (nheads, headdim) or (nheads,)
1138
- z: (batch, seqlen, nheads, headdim)
1139
- dt_bias: (nheads,)
1140
- Return:
1141
- out: (batch, seqlen, nheads, headdim)
1142
- """
1143
- batch, seqlen, nheads, headdim = x.shape
1144
- dstate = B.shape[-1]
1145
- if seqlen % chunk_size != 0:
1146
- dt = F.pad(dt, (0, 0, 0, chunk_size - seqlen % chunk_size))
1147
- dt = rearrange(dt, "b (c l) h -> b h c l", l=chunk_size)
1148
- dt = dt.float() # We want high precision for this before cumsum
1149
- if dt_bias is not None:
1150
- dt = dt + rearrange(dt_bias, "h -> h 1 1")
1151
- if dt_softplus:
1152
- dt = F.softplus(dt)
1153
- dA = dt * rearrange(A, "h -> h 1 1")
1154
- dA = dt * rearrange(A, "h -> h 1 1")
1155
- dA_cumsum = torch.cumsum(dA, dim=-1)
1156
- # 1. Compute the state for each chunk
1157
- states = chunk_state(B, x, dt, dA_cumsum, states_in_fp32=True)
1158
- # 2. Pass the state to all the chunks by weighted cumsum.
1159
- states = rearrange(
1160
- state_passing(
1161
- rearrange(states, "... p n -> ... (p n)"), dA_cumsum[:, :, :, -1]
1162
- )[0],
1163
- "... (p n) -> ... p n",
1164
- n=dstate,
1165
- )
1166
- # 3. Compute the output for each chunk
1167
- out = chunk_scan(B, C, x, dt, dA_cumsum, states, D=D, z=z)
1168
- return out
1169
-
1170
-
1171
- def ssd_chunk_scan_combined_ref(
1172
- x, dt, A, B, C, chunk_size, D=None, z=None, dt_bias=None, dt_softplus=False
1173
- ):
1174
- """
1175
- Argument:
1176
- x: (batch, seqlen, nheads, headdim)
1177
- dt: (batch, seqlen, nheads)
1178
- A: (nheads)
1179
- B: (batch, seqlen, ngroups, dstate)
1180
- C: (batch, seqlen, ngroups, dstate)
1181
- D: (nheads, headdim) or (nheads,)
1182
- z: (batch, seqlen, nheads, headdim)
1183
- dt_bias: (nheads,)
1184
- Return:
1185
- out: (batch, seqlen, nheads, headdim)
1186
- """
1187
- batch, seqlen, nheads, headdim = x.shape
1188
- dstate = B.shape[-1]
1189
- if seqlen % chunk_size != 0:
1190
- dt = F.pad(dt, (0, 0, 0, chunk_size - seqlen % chunk_size))
1191
- dt = rearrange(dt, "b (c l) h -> b h c l", l=chunk_size)
1192
- dt = dt.float() # We want high precision for this before cumsum
1193
- if dt_bias is not None:
1194
- dt = dt + rearrange(dt_bias, "h -> h 1 1")
1195
- if dt_softplus:
1196
- dt = F.softplus(dt)
1197
- dA = dt * rearrange(A, "h -> h 1 1")
1198
- dA_cumsum = torch.cumsum(dA, dim=-1)
1199
- # 1. Compute the state for each chunk
1200
- states = chunk_state_ref(B, x, dt, dA_cumsum)
1201
- states_dtype = states.dtype
1202
- if states.dtype not in [torch.float32, torch.float64]:
1203
- states = states.to(torch.float32)
1204
- # 2. Pass the state to all the chunks by weighted cumsum.
1205
- # state_passing_ref is much less numerically stable
1206
- states = rearrange(
1207
- state_passing_ref(
1208
- rearrange(states, "... p n -> ... (p n)"), dA_cumsum[:, :, :, -1]
1209
- )[0],
1210
- "... (p n) -> ... p n",
1211
- n=dstate,
1212
- )
1213
- states = states.to(states_dtype)
1214
- # 3. Compute the output for each chunk
1215
- out = chunk_scan_ref(B, C, x, dt, dA_cumsum, states, D=D, z=z)
1216
- return out
1217
-
1218
-
1219
- def ssd_selective_scan(
1220
- x,
1221
- dt,
1222
- A,
1223
- B,
1224
- C,
1225
- D=None,
1226
- z=None,
1227
- dt_bias=None,
1228
- dt_softplus=False,
1229
- dt_limit=(0.0, float("inf")),
1230
- ):
1231
- """
1232
- Argument:
1233
- x: (batch, seqlen, nheads, headdim)
1234
- dt: (batch, seqlen, nheads) or (batch, seqlen, nheads, headdim)
1235
- A: (nheads) or (dim, dstate)
1236
- B: (batch, seqlen, ngroups, dstate)
1237
- C: (batch, seqlen, ngroups, dstate)
1238
- D: (nheads, headdim) or (nheads,)
1239
- z: (batch, seqlen, nheads, headdim)
1240
- dt_bias: (nheads,) or (nheads, headdim)
1241
- Return:
1242
- out: (batch, seqlen, nheads, headdim)
1243
- """
1244
- from ..selective_scan_interface import selective_scan_fn
1245
-
1246
- batch, seqlen, nheads, headdim = x.shape
1247
- _, _, ngroups, dstate = B.shape
1248
- x = rearrange(x, "b l h p -> b (h p) l")
1249
- if dt.dim() == 3:
1250
- dt = repeat(dt, "b l h -> b l h p", p=headdim)
1251
- dt = rearrange(dt, "b l h p -> b (h p) l")
1252
- if A.dim() == 1:
1253
- A = repeat(A, "h -> (h p) n", p=headdim, n=dstate).to(dtype=torch.float32)
1254
- else:
1255
- A = A.to(dtype=torch.float32)
1256
- B = rearrange(B, "b l g n -> b g n l")
1257
- C = rearrange(C, "b l g n -> b g n l")
1258
- if D is not None:
1259
- if D.dim() == 2:
1260
- D = rearrange(D, "h p -> (h p)")
1261
- else:
1262
- D = repeat(D, "h -> (h p)", p=headdim)
1263
- if z is not None:
1264
- z = rearrange(z, "b l h p -> b (h p) l")
1265
- if dt_bias is not None:
1266
- if dt_bias.dim() == 1:
1267
- dt_bias = repeat(dt_bias, "h -> h p", p=headdim)
1268
- dt_bias = rearrange(dt_bias, "h p -> (h p)")
1269
- if dt_limit != (0.0, float("inf")):
1270
- if dt_bias is not None:
1271
- dt = dt + rearrange(dt_bias, "d -> d 1")
1272
- if dt_softplus:
1273
- dt = F.softplus(dt)
1274
- dt = dt.clamp(min=dt_limit[0], max=dt_limit[1]).to(x.dtype)
1275
- dt_bias = None
1276
- dt_softplus = None
1277
- out = selective_scan_fn(
1278
- x, dt, A, B, C, D=D, z=z, delta_bias=dt_bias, delta_softplus=dt_softplus
1279
- )
1280
- return rearrange(out, "b (h p) l -> b l h p", p=headdim)
1281
-
1282
-
1283
- def mamba_conv1d_scan_ref(
1284
- xBC,
1285
- conv1d_weight,
1286
- conv1d_bias,
1287
- dt,
1288
- A,
1289
- chunk_size,
1290
- D=None,
1291
- z=None,
1292
- dt_bias=None,
1293
- dt_softplus=False,
1294
- dt_limit=(0.0, float("inf")),
1295
- activation="silu",
1296
- headdim=None,
1297
- ngroups=1,
1298
- ):
1299
- """
1300
- Argument:
1301
- xBC: (batch, seqlen, dim + 2 * ngroups * dstate) where dim == nheads * headdim
1302
- conv1d_weight: (dim + 2 * ngroups * dstate, width)
1303
- conv1d_bias: (dim + 2 * ngroups * dstate,)
1304
- dt: (batch, seqlen, nheads) or (batch, seqlen, nheads, headdim)
1305
- A: (nheads)
1306
- D: (nheads, headdim) or (nheads,)
1307
- z: (batch, seqlen, dim)
1308
- dt_bias: (nheads) or (nheads, headdim)
1309
- headdim: if D is 1D and z is None, headdim must be passed in
1310
- Return:
1311
- out: (batch, seqlen, dim)
1312
- """
1313
- batch, seqlen, nheads = dt.shape[:3]
1314
- assert nheads % ngroups == 0
1315
- if z is not None:
1316
- dim = z.shape[-1]
1317
- assert dim % nheads == 0
1318
- headdim = dim // nheads
1319
- else:
1320
- if D.dim() == 1:
1321
- assert headdim is not None
1322
- else:
1323
- headdim = D.shape[1]
1324
- dim = nheads * headdim
1325
- xBC = rearrange(
1326
- causal_conv1d_fn(
1327
- rearrange(xBC, "b s d -> b d s"),
1328
- conv1d_weight,
1329
- conv1d_bias,
1330
- activation=activation,
1331
- ),
1332
- "b d s -> b s d",
1333
- )
1334
- dstate = (xBC.shape[-1] - dim) // ngroups // 2
1335
- x, B, C = torch.split(xBC, [dim, ngroups * dstate, ngroups * dstate], dim=-1)
1336
- x = rearrange(x, "b l (h p) -> b l h p", h=nheads)
1337
- B = rearrange(B, "b l (g n) -> b l g n", g=ngroups)
1338
- C = rearrange(C, "b l (g n) -> b l g n", g=ngroups)
1339
- z = rearrange(z, "b l (h p) -> b l h p", h=nheads) if z is not None else None
1340
- out = ssd_selective_scan(
1341
- x,
1342
- dt.to(x.dtype),
1343
- A,
1344
- B,
1345
- C,
1346
- D=D.float(),
1347
- z=z,
1348
- dt_bias=dt_bias,
1349
- dt_softplus=dt_softplus,
1350
- dt_limit=dt_limit,
1351
- )
1352
- return rearrange(out, "b s h p -> b s (h p)")
1353
-
1354
-
1355
- class MambaSplitConv1dScanCombinedFn(torch.autograd.Function):
1356
-
1357
- @staticmethod
1358
- @custom_fwd
1359
- def forward(
1360
- ctx,
1361
- zxbcdt,
1362
- conv1d_weight,
1363
- conv1d_bias,
1364
- dt_bias,
1365
- A,
1366
- D,
1367
- chunk_size,
1368
- initial_states=None,
1369
- seq_idx=None,
1370
- dt_limit=(0.0, float("inf")),
1371
- return_final_states=False,
1372
- activation="silu",
1373
- rmsnorm_weight=None,
1374
- rmsnorm_eps=1e-6,
1375
- outproj_weight=None,
1376
- outproj_bias=None,
1377
- headdim=None,
1378
- ngroups=1,
1379
- norm_before_gate=True,
1380
- ):
1381
- assert activation in [None, "silu", "swish"]
1382
- if D.dim() == 1:
1383
- assert headdim is not None
1384
- (nheads,) = D.shape
1385
- else:
1386
- nheads, headdim = D.shape
1387
- batch, seqlen, _ = zxbcdt.shape
1388
- dim = nheads * headdim
1389
- assert nheads % ngroups == 0
1390
- dstate = (conv1d_weight.shape[0] - dim) // ngroups // 2
1391
- d_nonssm = (zxbcdt.shape[-1] - 2 * dim - 2 * ngroups * dstate - nheads) // 2
1392
- assert d_nonssm >= 0
1393
- assert zxbcdt.shape == (
1394
- batch,
1395
- seqlen,
1396
- 2 * d_nonssm + 2 * dim + 2 * ngroups * dstate + nheads,
1397
- )
1398
- assert dt_bias.shape == (nheads,)
1399
- assert A.shape == (nheads,)
1400
- zx0, z, xBC, dt = torch.split(
1401
- zxbcdt, [2 * d_nonssm, dim, dim + ngroups * dstate * 2, nheads], dim=-1
1402
- )
1403
- seq_idx = seq_idx.contiguous() if seq_idx is not None else None
1404
- xBC_conv = rearrange(
1405
- causal_conv1d_cuda.causal_conv1d_fwd(
1406
- rearrange(xBC, "b s d -> b d s"),
1407
- conv1d_weight,
1408
- conv1d_bias,
1409
- seq_idx,
1410
- None,
1411
- None,
1412
- activation in ["silu", "swish"],
1413
- ),
1414
- "b d s -> b s d",
1415
- )
1416
- x, B, C = torch.split(
1417
- xBC_conv, [dim, ngroups * dstate, ngroups * dstate], dim=-1
1418
- )
1419
- x = rearrange(x, "b l (h p) -> b l h p", h=nheads)
1420
- B = rearrange(B, "b l (g n) -> b l g n", g=ngroups)
1421
- C = rearrange(C, "b l (g n) -> b l g n", g=ngroups)
1422
- z = rearrange(z, "b l (h p) -> b l h p", h=nheads) if z is not None else None
1423
- if rmsnorm_weight is None:
1424
- out, out_x, dt_out, dA_cumsum, states, final_states = (
1425
- _mamba_chunk_scan_combined_fwd(
1426
- x,
1427
- dt,
1428
- A,
1429
- B,
1430
- C,
1431
- chunk_size=chunk_size,
1432
- D=D,
1433
- z=z,
1434
- dt_bias=dt_bias,
1435
- initial_states=initial_states,
1436
- seq_idx=seq_idx,
1437
- dt_softplus=True,
1438
- dt_limit=dt_limit,
1439
- )
1440
- )
1441
- out = rearrange(out, "b s h p -> b s (h p)")
1442
- rstd = None
1443
- if d_nonssm > 0:
1444
- out = torch.cat([_swiglu_fwd(zx0), out], dim=-1)
1445
- else:
1446
- out_x, _, dt_out, dA_cumsum, states, final_states = (
1447
- _mamba_chunk_scan_combined_fwd(
1448
- x,
1449
- dt,
1450
- A,
1451
- B,
1452
- C,
1453
- chunk_size=chunk_size,
1454
- D=D,
1455
- z=None,
1456
- dt_bias=dt_bias,
1457
- initial_states=initial_states,
1458
- seq_idx=seq_idx,
1459
- dt_softplus=True,
1460
- dt_limit=dt_limit,
1461
- )
1462
- )
1463
- # reshape input data into 2D tensor
1464
- x_rms = rearrange(out_x, "b s h p -> (b s) (h p)")
1465
- z_rms = rearrange(z, "b s h p -> (b s) (h p)")
1466
- rmsnorm_weight = rmsnorm_weight.contiguous()
1467
- if d_nonssm == 0:
1468
- out = None
1469
- else:
1470
- out01 = torch.empty(
1471
- (batch, seqlen, d_nonssm + dim),
1472
- dtype=x_rms.dtype,
1473
- device=x_rms.device,
1474
- )
1475
- out = rearrange(out01[..., d_nonssm:], "b s d -> (b s) d")
1476
- _swiglu_fwd(zx0, out=out01[..., :d_nonssm])
1477
- out, _, rstd = _layer_norm_fwd(
1478
- x_rms,
1479
- rmsnorm_weight,
1480
- None,
1481
- rmsnorm_eps,
1482
- z_rms,
1483
- out=out,
1484
- group_size=dim // ngroups,
1485
- norm_before_gate=norm_before_gate,
1486
- is_rms_norm=True,
1487
- )
1488
- if d_nonssm == 0:
1489
- out = rearrange(out, "(b s) d -> b s d", b=batch)
1490
- else:
1491
- out = out01
1492
- ctx.outproj_weight_dtype = (
1493
- outproj_weight.dtype if outproj_weight is not None else None
1494
- )
1495
- if outproj_weight is not None:
1496
- if torch.is_autocast_enabled():
1497
- dtype = torch.get_autocast_gpu_dtype()
1498
- out, outproj_weight = out.to(dtype), outproj_weight.to(dtype)
1499
- outproj_bias = (
1500
- outproj_bias.to(dtype) if outproj_bias is not None else None
1501
- )
1502
- out = F.linear(out, outproj_weight, outproj_bias)
1503
- else:
1504
- assert outproj_bias is None
1505
- ctx.save_for_backward(
1506
- zxbcdt,
1507
- conv1d_weight,
1508
- conv1d_bias,
1509
- out_x,
1510
- A,
1511
- D,
1512
- dt_bias,
1513
- initial_states,
1514
- seq_idx,
1515
- rmsnorm_weight,
1516
- rstd,
1517
- outproj_weight,
1518
- outproj_bias,
1519
- )
1520
- ctx.dt_limit = dt_limit
1521
- ctx.return_final_states = return_final_states
1522
- ctx.activation = activation
1523
- ctx.rmsnorm_eps = rmsnorm_eps
1524
- ctx.norm_before_gate = norm_before_gate
1525
- ctx.chunk_size = chunk_size
1526
- ctx.headdim = headdim
1527
- ctx.ngroups = ngroups
1528
- return out if not return_final_states else (out, final_states)
1529
-
1530
- @staticmethod
1531
- @custom_bwd
1532
- def backward(ctx, dout, *args):
1533
- (
1534
- zxbcdt,
1535
- conv1d_weight,
1536
- conv1d_bias,
1537
- out,
1538
- A,
1539
- D,
1540
- dt_bias,
1541
- initial_states,
1542
- seq_idx,
1543
- rmsnorm_weight,
1544
- rstd,
1545
- outproj_weight,
1546
- outproj_bias,
1547
- ) = ctx.saved_tensors
1548
- dfinal_states = args[0] if ctx.return_final_states else None
1549
- headdim = ctx.headdim
1550
- nheads = D.shape[0]
1551
- dim = nheads * headdim
1552
- assert nheads % ctx.ngroups == 0
1553
- dstate = (conv1d_weight.shape[0] - dim) // ctx.ngroups // 2
1554
- d_nonssm = (zxbcdt.shape[-1] - 2 * dim - 2 * ctx.ngroups * dstate - nheads) // 2
1555
- assert d_nonssm >= 0
1556
- recompute_output = outproj_weight is not None
1557
- if recompute_output:
1558
- out_recompute = torch.empty(
1559
- *out.shape[:2], d_nonssm + dim, device=out.device, dtype=out.dtype
1560
- )
1561
- out0_recompute, out1_recompute = out_recompute.split(
1562
- [d_nonssm, dim], dim=-1
1563
- )
1564
- zx0, z, xBC, dt = torch.split(
1565
- zxbcdt, [2 * d_nonssm, dim, dim + 2 * ctx.ngroups * dstate, nheads], dim=-1
1566
- )
1567
- # Recompute x, B, C
1568
- xBC_conv = rearrange(
1569
- causal_conv1d_cuda.causal_conv1d_fwd(
1570
- rearrange(xBC, "b s d -> b d s"),
1571
- conv1d_weight,
1572
- conv1d_bias,
1573
- seq_idx,
1574
- None,
1575
- None,
1576
- ctx.activation in ["silu", "swish"],
1577
- ),
1578
- "b d s -> b s d",
1579
- )
1580
- x, B, C = torch.split(
1581
- xBC_conv, [dim, ctx.ngroups * dstate, ctx.ngroups * dstate], dim=-1
1582
- )
1583
- x = rearrange(x, "b l (h p) -> b l h p", h=nheads)
1584
- B = rearrange(B, "b l (g n) -> b l g n", g=ctx.ngroups)
1585
- C = rearrange(C, "b l (g n) -> b l g n", g=ctx.ngroups)
1586
- dzxbcdt = torch.empty_like(zxbcdt)
1587
- dzx0, dz, dxBC_given, ddt_given = torch.split(
1588
- dzxbcdt, [2 * d_nonssm, dim, dim + 2 * ctx.ngroups * dstate, nheads], dim=-1
1589
- )
1590
- dxBC = torch.empty_like(xBC)
1591
- dx, dB, dC = torch.split(
1592
- dxBC, [dim, ctx.ngroups * dstate, ctx.ngroups * dstate], dim=-1
1593
- )
1594
- z = rearrange(z, "b l (h p) -> b l h p", h=nheads)
1595
- dx = rearrange(dx, "b l (h p) -> b l h p", h=nheads)
1596
- dB = rearrange(dB, "b l (g n) -> b l g n", g=ctx.ngroups)
1597
- dC = rearrange(dC, "b l (g n) -> b l g n", g=ctx.ngroups)
1598
- if outproj_weight is not None:
1599
- dout_og = dout
1600
- dout = F.linear(dout, outproj_weight.t())
1601
- if d_nonssm > 0:
1602
- dout0, dout = dout.split([d_nonssm, dim], dim=-1)
1603
- _swiglu_bwd(zx0, dout0, dxy=dzx0, recompute_output=True, out=out0_recompute)
1604
- dout = rearrange(dout, "b s (h p) -> b s h p", p=headdim)
1605
- if rmsnorm_weight is None:
1606
- dz = rearrange(dz, "b l (h p) -> b l h p", h=nheads)
1607
- dx, ddt, dA, dB, dC, dD, dz, ddt_bias, dinitial_states, *rest = (
1608
- _mamba_chunk_scan_combined_bwd(
1609
- dout,
1610
- x,
1611
- dt,
1612
- A,
1613
- B,
1614
- C,
1615
- out,
1616
- ctx.chunk_size,
1617
- D=D,
1618
- z=z,
1619
- dt_bias=dt_bias,
1620
- initial_states=initial_states,
1621
- dfinal_states=dfinal_states,
1622
- seq_idx=seq_idx,
1623
- dt_softplus=True,
1624
- dt_limit=ctx.dt_limit,
1625
- dx=dx,
1626
- ddt=ddt_given,
1627
- dB=dB,
1628
- dC=dC,
1629
- dz=dz,
1630
- recompute_output=recompute_output,
1631
- )
1632
- )
1633
- out_for_linear = (
1634
- rearrange(rest[0], "b s h p -> b s (h p)") if recompute_output else None
1635
- )
1636
- drmsnorm_weight = None
1637
- else:
1638
- batch = dout.shape[0]
1639
- dy_rms = rearrange(dout, "b s h p -> (b s) (h p)")
1640
- dz = rearrange(dz, "b l d -> (b l) d")
1641
- x_rms = rearrange(out, "b s h p -> (b s) (h p)")
1642
- z_rms = rearrange(z, "b s h p -> (b s) (h p)")
1643
- out1_recompute = (
1644
- rearrange(out1_recompute, "b s d -> (b s) d")
1645
- if recompute_output
1646
- else None
1647
- )
1648
- dout, drmsnorm_weight, _, dz, *rest = _layer_norm_bwd(
1649
- dy_rms,
1650
- x_rms,
1651
- rmsnorm_weight,
1652
- None,
1653
- ctx.rmsnorm_eps,
1654
- None,
1655
- rstd,
1656
- z_rms,
1657
- group_size=dim // ctx.ngroups,
1658
- norm_before_gate=ctx.norm_before_gate,
1659
- is_rms_norm=True,
1660
- recompute_output=recompute_output,
1661
- dz=dz,
1662
- out=out1_recompute if recompute_output else None,
1663
- )
1664
- out_for_linear = out_recompute if recompute_output else None
1665
- dout = rearrange(dout, "(b s) (h p) -> b s h p", b=batch, p=headdim)
1666
- dx, ddt, dA, dB, dC, dD, _, ddt_bias, dinitial_states = (
1667
- _mamba_chunk_scan_combined_bwd(
1668
- dout,
1669
- x,
1670
- dt,
1671
- A,
1672
- B,
1673
- C,
1674
- out,
1675
- ctx.chunk_size,
1676
- D=D,
1677
- z=None,
1678
- dt_bias=dt_bias,
1679
- initial_states=initial_states,
1680
- dfinal_states=dfinal_states,
1681
- seq_idx=seq_idx,
1682
- dt_softplus=True,
1683
- dt_limit=ctx.dt_limit,
1684
- dx=dx,
1685
- ddt=ddt_given,
1686
- dB=dB,
1687
- dC=dC,
1688
- )
1689
- )
1690
-
1691
- if outproj_weight is not None:
1692
- doutproj_weight = torch.einsum("bso,bsd->od", dout_og, out_for_linear)
1693
- doutproj_bias = (
1694
- dout_og.sum(dim=(0, 1)) if outproj_bias is not None else None
1695
- )
1696
- else:
1697
- doutproj_weight, doutproj_bias = None, None
1698
- dxBC_given = rearrange(dxBC_given, "b s d -> b d s")
1699
- dxBC_given, dweight, dbias, *_ = causal_conv1d_cuda.causal_conv1d_bwd(
1700
- rearrange(xBC, "b s d -> b d s"),
1701
- conv1d_weight,
1702
- conv1d_bias,
1703
- rearrange(dxBC, "b s d -> b d s"),
1704
- seq_idx,
1705
- None,
1706
- None,
1707
- dxBC_given,
1708
- False,
1709
- ctx.activation in ["silu", "swish"],
1710
- )
1711
- dxBC_given = rearrange(dxBC_given, "b d s -> b s d")
1712
- return (
1713
- dzxbcdt,
1714
- dweight,
1715
- dbias,
1716
- ddt_bias,
1717
- dA,
1718
- dD,
1719
- None,
1720
- dinitial_states,
1721
- None,
1722
- None,
1723
- None,
1724
- None,
1725
- drmsnorm_weight,
1726
- None,
1727
- doutproj_weight,
1728
- doutproj_bias,
1729
- None,
1730
- None,
1731
- None,
1732
- )
1733
-
1734
-
1735
- def mamba_split_conv1d_scan_combined(
1736
- zxbcdt,
1737
- conv1d_weight,
1738
- conv1d_bias,
1739
- dt_bias,
1740
- A,
1741
- D,
1742
- chunk_size,
1743
- initial_states=None,
1744
- seq_idx=None,
1745
- dt_limit=(0.0, float("inf")),
1746
- return_final_states=False,
1747
- activation="silu",
1748
- rmsnorm_weight=None,
1749
- rmsnorm_eps=1e-6,
1750
- outproj_weight=None,
1751
- outproj_bias=None,
1752
- headdim=None,
1753
- ngroups=1,
1754
- norm_before_gate=True,
1755
- ):
1756
- """
1757
- Argument:
1758
- zxbcdt: (batch, seqlen, 2 * dim + 2 * ngroups * dstate + nheads) where dim == nheads * headdim
1759
- conv1d_weight: (dim + 2 * ngroups * dstate, width)
1760
- conv1d_bias: (dim + 2 * ngroups * dstate,)
1761
- dt_bias: (nheads,)
1762
- A: (nheads)
1763
- D: (nheads, headdim) or (nheads,)
1764
- initial_states: (batch, nheads, headdim, dstate)
1765
- seq_idx: (batch, seqlen), int32
1766
- rmsnorm_weight: (dim,)
1767
- outproj_weight: (out_dim, dim)
1768
- outproj_bias: (out_dim,)
1769
- headdim: if D is 1D, headdim must be passed in
1770
- norm_before_gate: if True, we do RMSNorm(x) * F.silu(z). If False, we do RMSNorm(x * F.silu(z))
1771
- Return:
1772
- out: (batch, seqlen, dim)
1773
- """
1774
- return MambaSplitConv1dScanCombinedFn.apply(
1775
- zxbcdt,
1776
- conv1d_weight,
1777
- conv1d_bias,
1778
- dt_bias,
1779
- A,
1780
- D,
1781
- chunk_size,
1782
- initial_states,
1783
- seq_idx,
1784
- dt_limit,
1785
- return_final_states,
1786
- activation,
1787
- rmsnorm_weight,
1788
- rmsnorm_eps,
1789
- outproj_weight,
1790
- outproj_bias,
1791
- headdim,
1792
- ngroups,
1793
- norm_before_gate,
1794
- )
1795
-
1796
-
1797
- def mamba_split_conv1d_scan_ref(
1798
- zxbcdt,
1799
- conv1d_weight,
1800
- conv1d_bias,
1801
- dt_bias,
1802
- A,
1803
- D,
1804
- chunk_size,
1805
- dt_limit=(0.0, float("inf")),
1806
- activation="silu",
1807
- rmsnorm_weight=None,
1808
- rmsnorm_eps=1e-6,
1809
- outproj_weight=None,
1810
- outproj_bias=None,
1811
- headdim=None,
1812
- ngroups=1,
1813
- norm_before_gate=True,
1814
- ):
1815
- """
1816
- Argument:
1817
- zxbcdt: (batch, seqlen, 2 * dim + 2 * ngroups * dstate + nheads) where dim == nheads * headdim
1818
- conv1d_weight: (dim + 2 * ngroups * dstate, width)
1819
- conv1d_bias: (dim + 2 * ngroups * dstate,)
1820
- dt_bias: (nheads,)
1821
- A: (nheads)
1822
- D: (nheads, headdim) or (nheads,)
1823
- rmsnorm_weight: (dim,)
1824
- outproj_weight: (out_dim, dim)
1825
- outproj_bias: (out_dim,)
1826
- headdim: if D is 1D, headdim must be passed in
1827
- norm_before_gate: if True, we do RMSNorm(x) * F.silu(z). If False, we do RMSNorm(x * F.silu(z))
1828
- Return:
1829
- out: (batch, seqlen, dim)
1830
- """
1831
- if D.dim() == 1:
1832
- assert headdim is not None
1833
- (nheads,) = D.shape
1834
- else:
1835
- nheads, headdim = D.shape
1836
- assert nheads % ngroups == 0
1837
- batch, seqlen, _ = zxbcdt.shape
1838
- dim = nheads * headdim
1839
- dstate = (zxbcdt.shape[-1] - 2 * dim - nheads) // ngroups // 2
1840
- assert zxbcdt.shape == (batch, seqlen, 2 * dim + 2 * ngroups * dstate + nheads)
1841
- assert dt_bias.shape == (nheads,)
1842
- assert A.shape == (nheads,)
1843
- if rmsnorm_weight is not None:
1844
- assert rmsnorm_weight.shape == (dim,)
1845
- z, xBC, dt = torch.split(zxbcdt, [dim, dim + 2 * ngroups * dstate, nheads], dim=-1)
1846
- xBC = rearrange(
1847
- causal_conv1d_fn(
1848
- rearrange(xBC, "b s d -> b d s"),
1849
- conv1d_weight,
1850
- conv1d_bias,
1851
- activation=activation,
1852
- ),
1853
- "b d s -> b s d",
1854
- )
1855
- x, B, C = torch.split(xBC, [dim, ngroups * dstate, ngroups * dstate], dim=-1)
1856
- x = rearrange(x, "b l (h p) -> b l h p", h=nheads)
1857
- B = rearrange(B, "b l (g n) -> b l g n", g=ngroups)
1858
- C = rearrange(C, "b l (g n) -> b l g n", g=ngroups)
1859
- z = rearrange(z, "b l (h p) -> b l h p", h=nheads)
1860
- out = ssd_selective_scan(
1861
- x,
1862
- dt.to(x.dtype),
1863
- A,
1864
- B,
1865
- C,
1866
- D=D.float(),
1867
- z=z if rmsnorm_weight is None else None,
1868
- dt_bias=dt_bias,
1869
- dt_softplus=True,
1870
- dt_limit=dt_limit,
1871
- )
1872
- out = rearrange(out, "b s h p -> b s (h p)")
1873
- if rmsnorm_weight is not None:
1874
- out = rmsnorm_fn(
1875
- out,
1876
- rmsnorm_weight,
1877
- None,
1878
- z=rearrange(z, "b l h p -> b l (h p)"),
1879
- eps=rmsnorm_eps,
1880
- norm_before_gate=norm_before_gate,
1881
- )
1882
- if outproj_weight is not None:
1883
- out = F.linear(out, outproj_weight, outproj_bias)
1884
- return out
 
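For reference, below is a minimal usage sketch of the deleted `mamba_split_conv1d_scan_combined`, following the tensor shapes given in its docstring above. The sizes, dtypes, and import path are illustrative assumptions, not taken from this diff.

```python
# Hypothetical usage sketch; shapes follow the docstring above, sizes are arbitrary.
import torch
from mamba_ssm.ops.triton.ssd_combined import mamba_split_conv1d_scan_combined

batch, seqlen, nheads, headdim, ngroups, dstate, width, chunk_size = 2, 128, 8, 64, 1, 16, 4, 64
dim = nheads * headdim                      # 512
conv_dim = dim + 2 * ngroups * dstate       # channels seen by the depthwise conv

zxbcdt = torch.randn(batch, seqlen, 2 * dim + 2 * ngroups * dstate + nheads,
                     device="cuda", dtype=torch.bfloat16)
conv1d_weight = torch.randn(conv_dim, width, device="cuda", dtype=torch.bfloat16)
conv1d_bias = torch.randn(conv_dim, device="cuda", dtype=torch.bfloat16)
dt_bias = torch.randn(nheads, device="cuda", dtype=torch.float32)
A = -torch.rand(nheads, device="cuda", dtype=torch.float32)   # negative real A, as in Mamba2
D = torch.randn(nheads, device="cuda", dtype=torch.float32)   # 1D D, so headdim must be passed

out = mamba_split_conv1d_scan_combined(
    zxbcdt, conv1d_weight, conv1d_bias, dt_bias, A, D, chunk_size,
    activation="silu", headdim=headdim, ngroups=ngroups,
)
print(out.shape)  # torch.Size([2, 128, 512]) since outproj_weight is None
```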
build/torch25-cxx11-cu121-x86_64-linux/mamba_ssm/__init__.py DELETED
@@ -1,14 +0,0 @@
1
- __version__ = "2.2.4"
2
-
3
- from .ops.selective_scan_interface import selective_scan_fn, mamba_inner_fn
4
- from .modules.mamba_simple import Mamba
5
- from .modules.mamba2 import Mamba2
6
- from .models.mixer_seq_simple import MambaLMHeadModel
7
-
8
- __all__ = [
9
- "selective_scan_fn",
10
- "mamba_inner_fn",
11
- "Mamba",
12
- "Mamba2",
13
- "MambaLMHeadModel",
14
- ]
 
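For context, the deleted `__init__.py` defines the package's public surface; assuming the built wheel is installed, the exports would be imported as below (illustrative only).

```python
# Illustrative top-level imports matching the __all__ list in the deleted __init__.py.
from mamba_ssm import Mamba, Mamba2, MambaLMHeadModel, mamba_inner_fn, selective_scan_fn
```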
build/torch25-cxx11-cu121-x86_64-linux/mamba_ssm/distributed/tensor_parallel.py DELETED
@@ -1,326 +0,0 @@
1
- # Copyright (c) 2024, Tri Dao.
2
- # The TensorParallel linear modules are inspired by https://github.com/NVIDIA/apex/blob/master/apex/transformer/tensor_parallel/layers.py
3
- from typing import Optional
4
-
5
- import torch
6
- import torch.nn as nn
7
- import torch.nn.functional as F
8
- from torch import Tensor
9
- from torch.distributed import ProcessGroup
10
- from ..utils.torch import custom_bwd, custom_fwd
11
-
12
- from einops import rearrange
13
-
14
- from ..distributed.distributed_utils import (
15
- all_gather_raw,
16
- all_reduce,
17
- all_reduce_raw,
18
- reduce_scatter,
19
- reduce_scatter_raw,
20
- )
21
-
22
-
23
- class ParallelLinearFunc(torch.autograd.Function):
24
- @staticmethod
25
- @custom_fwd
26
- def forward(ctx, x, weight, bias, process_group=None, sequence_parallel=True):
27
- """
28
- If process_group is not None and sequence_parallel=True, we're doing Tensor Parallel
29
- with sequence parallelism: we do an all_gather_raw of x before doing the matmul.
30
- """
31
- ctx.compute_weight_gradient = weight.requires_grad
32
- ctx.process_group = process_group
33
- ctx.sequence_parallel = sequence_parallel
34
-
35
- if torch.is_autocast_enabled():
36
- x = x.to(dtype=torch.get_autocast_gpu_dtype())
37
- x = x.contiguous()
38
- if process_group is not None and sequence_parallel:
39
- # We want to kick off the all_gather early, before weight dtype conversion
40
- total_x, handle_x = all_gather_raw(x, process_group, async_op=True)
41
- else:
42
- total_x = x
43
-
44
- if torch.is_autocast_enabled():
45
- weight = weight.to(dtype=torch.get_autocast_gpu_dtype())
46
- bias = (
47
- bias.to(dtype=torch.get_autocast_gpu_dtype())
48
- if bias is not None
49
- else None
50
- )
51
- weight = weight.contiguous()
52
- if process_group is not None and sequence_parallel:
53
- handle_x.wait()
54
- batch_shape, n = total_x.shape[:-1], total_x.shape[-1]
55
- batch_dim = batch_shape.numel()
56
- # https://github.com/pytorch/pytorch/blob/5b51849b48a7dbccd297286cc0110def4706f9e7/aten/src/ATen/native/cuda/Blas.cpp#L174
57
- output = F.linear(total_x, weight, bias)
58
- if ctx.compute_weight_gradient:
59
- ctx.save_for_backward(x, weight)
60
- else:
61
- ctx.save_for_backward(weight)
62
- return output
63
-
64
- @staticmethod
65
- @custom_bwd
66
- def backward(ctx, grad_output):
67
- grad_output = grad_output.contiguous()
68
- process_group = ctx.process_group
69
- sequence_parallel = ctx.sequence_parallel
70
- if ctx.compute_weight_gradient:
71
- x, weight = ctx.saved_tensors
72
- if process_group is not None and sequence_parallel:
73
- total_x, handle_x = all_gather_raw(x, process_group, async_op=True)
74
- else:
75
- total_x = x
76
- else:
77
- (weight,) = ctx.saved_tensors
78
- total_x = None
79
- batch_shape = grad_output.shape[:-1]
80
- batch_dim = batch_shape.numel()
81
- grad_output = grad_output.reshape(batch_dim, grad_output.shape[-1])
82
- if ctx.needs_input_grad[0]:
83
- grad_input = F.linear(grad_output, weight.t())
84
- grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1])
85
- if process_group is not None:
86
- reduce_fn = reduce_scatter_raw if sequence_parallel else all_reduce_raw
87
- grad_input, handle_grad_input = reduce_fn(
88
- grad_input, process_group, async_op=True
89
- )
90
- else:
91
- grad_input = None
92
- if ctx.needs_input_grad[1]:
93
- assert ctx.compute_weight_gradient
94
- if process_group is not None and sequence_parallel:
95
- handle_x.wait()
96
- grad_weight = torch.einsum(
97
- "bo,bi->oi", grad_output, total_x.reshape(batch_dim, total_x.shape[-1])
98
- )
99
- else:
100
- grad_weight = None
101
- grad_bias = grad_output.sum(dim=0) if ctx.needs_input_grad[2] else None
102
- if process_group is not None and ctx.needs_input_grad[0]:
103
- handle_grad_input.wait()
104
- return grad_input, grad_weight, grad_bias, None, None
105
-
106
-
107
- def parallel_linear_func(
108
- x: Tensor,
109
- weight: Tensor,
110
- bias: Optional[Tensor] = None,
111
- process_group: Optional[ProcessGroup] = None,
112
- sequence_parallel: bool = True,
113
- ):
114
- return ParallelLinearFunc.apply(x, weight, bias, process_group, sequence_parallel)
115
-
116
-
117
- class ColumnParallelLinear(nn.Linear):
118
- def __init__(
119
- self,
120
- in_features: int,
121
- out_features: int,
122
- process_group: ProcessGroup,
123
- bias: bool = True,
124
- sequence_parallel=True,
125
- multiple_of=1,
126
- device=None,
127
- dtype=None,
128
- ) -> None:
129
- world_size = torch.distributed.get_world_size(process_group)
130
- if out_features % multiple_of:
131
- raise ValueError(
132
- f"out_features ({out_features}) must be a multiple of {multiple_of}"
133
- )
134
- multiple = out_features // multiple_of
135
- # We want to split @multiple across world_size, but it could be an uneven split
136
- div = multiple // world_size
137
- mod = multiple % world_size
138
- # The first @mod ranks get @div + 1 copies, the rest get @div copies
139
- local_multiple = div + int(torch.distributed.get_rank(process_group) < mod)
140
- super().__init__(
141
- in_features,
142
- local_multiple * multiple_of,
143
- bias=bias,
144
- device=device,
145
- dtype=dtype,
146
- )
147
- self.process_group = process_group
148
- self.sequence_parallel = sequence_parallel
149
-
150
- def forward(self, x):
151
- # If self.sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism:
152
- # we do an all_gather of x before doing the matmul.
153
- # If not, then the input is already gathered.
154
- return parallel_linear_func(
155
- x,
156
- self.weight,
157
- self.bias,
158
- process_group=self.process_group,
159
- sequence_parallel=self.sequence_parallel,
160
- )
161
-
162
-
163
- class RowParallelLinear(nn.Linear):
164
- def __init__(
165
- self,
166
- in_features: int,
167
- out_features: int,
168
- process_group: ProcessGroup,
169
- bias: bool = True,
170
- sequence_parallel=True,
171
- multiple_of=1,
172
- device=None,
173
- dtype=None,
174
- ) -> None:
175
- world_size = torch.distributed.get_world_size(process_group)
176
- rank = torch.distributed.get_rank(process_group)
177
- if in_features % multiple_of:
178
- raise ValueError(
179
- f"in_features ({in_features}) must be a multiple of {multiple_of}"
180
- )
181
- multiple = in_features // multiple_of
182
- # We want to split @multiple across world_size, but it could be an uneven split
183
- div = multiple // world_size
184
- mod = multiple % world_size
185
- # The first @mod ranks get @div + 1 copies, the rest get @div copies
186
- local_multiple = div + int(torch.distributed.get_rank(process_group) < mod)
187
- # Only rank 0 will have bias
188
- super().__init__(
189
- local_multiple * multiple_of,
190
- out_features,
191
- bias=bias and rank == 0,
192
- device=device,
193
- dtype=dtype,
194
- )
195
- self.process_group = process_group
196
- self.sequence_parallel = sequence_parallel
197
-
198
- def forward(self, x):
199
- """
200
- We're doing Tensor Parallel with sequence parallelism: we do the matmul and then
201
- a reduce_scatter of the result.
202
- """
203
- out = parallel_linear_func(x, self.weight, self.bias)
204
- reduce_fn = reduce_scatter if self.sequence_parallel else all_reduce
205
- return reduce_fn(out, self.process_group)
206
-
207
-
208
- class VocabParallelEmbedding(nn.Embedding):
209
- def __init__(
210
- self, num_embeddings, *args, process_group=None, padding_idx=None, **kwargs
211
- ):
212
- self.process_group = process_group
213
- if process_group is not None:
214
- world_size = torch.distributed.get_world_size(process_group)
215
- if num_embeddings % world_size != 0:
216
- raise ValueError(
217
- f"num_embeddings ({num_embeddings}) must be divisible by "
218
- f"world_size ({world_size})"
219
- )
220
- if world_size > 1 and padding_idx is not None:
221
- raise RuntimeError("ParallelEmbedding does not support padding_idx")
222
- else:
223
- world_size = 1
224
- super().__init__(
225
- num_embeddings // world_size, *args, padding_idx=padding_idx, **kwargs
226
- )
227
-
228
- def forward(self, input: Tensor) -> Tensor:
229
- if self.process_group is None:
230
- return super().forward(input)
231
- else:
232
- rank = torch.distributed.get_rank(self.process_group)
233
- vocab_size = self.num_embeddings
234
- vocab_start_index, vocab_end_index = (
235
- rank * vocab_size,
236
- (rank + 1) * vocab_size,
237
- )
238
- # Create a mask of out-of-range vocab ids (True means the id needs to be masked).
239
- input_ids_mask = (input < vocab_start_index) | (input >= vocab_end_index)
240
- input = input - vocab_start_index
241
- input[input_ids_mask] = 0
242
- embeddings = super().forward(input)
243
- embeddings[input_ids_mask] = 0.0
244
- return embeddings
245
-
246
-
247
- class ColumnParallelEmbedding(nn.Embedding):
248
- def __init__(
249
- self, num_embeddings, embedding_dim, *args, process_group=None, **kwargs
250
- ):
251
- self.process_group = process_group
252
- if process_group is not None:
253
- world_size = torch.distributed.get_world_size(process_group)
254
- if embedding_dim % world_size != 0:
255
- raise ValueError(
256
- f"embedding_dim ({embedding_dim}) must be divisible by "
257
- f"world_size ({world_size})"
258
- )
259
- else:
260
- world_size = 1
261
- super().__init__(num_embeddings, embedding_dim // world_size, *args, **kwargs)
262
-
263
-
264
- class ParallelEmbeddings(nn.Module):
265
- def __init__(
266
- self,
267
- embed_dim,
268
- vocab_size,
269
- max_position_embeddings,
270
- process_group,
271
- padding_idx=None,
272
- sequence_parallel=True,
273
- device=None,
274
- dtype=None,
275
- ):
276
- """
277
- If max_position_embeddings <= 0, there are no position embeddings
278
- """
279
- factory_kwargs = {"device": device, "dtype": dtype}
280
- super().__init__()
281
- self.process_group = process_group
282
- self.sequence_parallel = sequence_parallel
283
- self.word_embeddings = VocabParallelEmbedding(
284
- vocab_size,
285
- embed_dim,
286
- padding_idx=padding_idx,
287
- process_group=process_group,
288
- **factory_kwargs,
289
- )
290
- self.max_position_embeddings = max_position_embeddings
291
- if self.max_position_embeddings > 0:
292
- self.position_embeddings = ColumnParallelEmbedding(
293
- max_position_embeddings,
294
- embed_dim,
295
- process_group=process_group,
296
- **factory_kwargs,
297
- )
298
-
299
- def forward(self, input_ids, position_ids=None, combine_batch_seqlen_dim=False):
300
- """
301
- input_ids: (batch, seqlen)
302
- position_ids: (batch, seqlen)
303
- """
304
- batch_size, seqlen = input_ids.shape
305
- world_size = torch.distributed.get_world_size(self.process_group)
306
- embeddings = self.word_embeddings(input_ids)
307
- if self.max_position_embeddings > 0:
308
- if position_ids is None:
309
- position_ids = torch.arange(
310
- seqlen, dtype=torch.long, device=input_ids.device
311
- )
312
- position_embeddings = self.position_embeddings(position_ids)
313
- if world_size <= 1:
314
- embeddings = embeddings + position_embeddings
315
- else:
316
- partition_dim = self.position_embeddings.embedding_dim
317
- rank = torch.distributed.get_rank(self.process_group)
318
- embeddings[
319
- ..., rank * partition_dim : (rank + 1) * partition_dim
320
- ] += position_embeddings
321
- if combine_batch_seqlen_dim:
322
- embeddings = rearrange(embeddings, "b s d -> (b s) d")
323
- reduce_fn = reduce_scatter if self.sequence_parallel else all_reduce
324
- return (
325
- embeddings if world_size <= 1 else reduce_fn(embeddings, self.process_group)
326
- )
 
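A sketch of how the tensor-parallel layers in the deleted module compose, assuming the job is launched with torchrun and an NCCL process group; the layer sizes and the SiLU MLP are illustrative, not part of this diff.

```python
# Hypothetical tensor-parallel MLP built from the layers above.
import torch
import torch.distributed as dist
import torch.nn.functional as F
from mamba_ssm.distributed.tensor_parallel import ColumnParallelLinear, RowParallelLinear

dist.init_process_group("nccl")                      # env:// rendezvous set up by torchrun
device = torch.device("cuda", dist.get_rank() % torch.cuda.device_count())
pg = dist.group.WORLD

# Column-parallel shards out_features across ranks (all-gathering the sequence first
# when sequence_parallel=True); row-parallel shards in_features and reduce-scatters.
fc1 = ColumnParallelLinear(1024, 4096, process_group=pg, sequence_parallel=True, device=device)
fc2 = RowParallelLinear(4096, 1024, process_group=pg, sequence_parallel=True, device=device)

x = torch.randn(512 // dist.get_world_size(), 1024, device=device)  # local shard of the sequence
y = fc2(F.silu(fc1(x)))                                             # (512 // world_size, 1024)
```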
build/torch25-cxx11-cu121-x86_64-linux/mamba_ssm/models/mixer_seq_simple.py DELETED
@@ -1,338 +0,0 @@
1
- # Copyright (c) 2023, Albert Gu, Tri Dao.
2
-
3
- import math
4
- from functools import partial
5
- import json
6
- import os
7
- import copy
8
-
9
- from collections import namedtuple
10
-
11
- import torch
12
- import torch.nn as nn
13
-
14
- from .config_mamba import MambaConfig
15
- from ..modules.mamba_simple import Mamba
16
- from ..modules.mamba2 import Mamba2
17
- from ..modules.mha import MHA
18
- from ..modules.mlp import GatedMLP
19
- from ..modules.block import Block
20
- from ..utils.generation import GenerationMixin
21
- from ..utils.hf import load_config_hf, load_state_dict_hf
22
-
23
- try:
24
- from ..ops.triton.layer_norm import RMSNorm, layer_norm_fn, rms_norm_fn
25
- except ImportError:
26
- RMSNorm, layer_norm_fn, rms_norm_fn = None, None, None
27
-
28
-
29
- def create_block(
30
- d_model,
31
- d_intermediate,
32
- ssm_cfg=None,
33
- attn_layer_idx=None,
34
- attn_cfg=None,
35
- norm_epsilon=1e-5,
36
- rms_norm=False,
37
- residual_in_fp32=False,
38
- fused_add_norm=False,
39
- layer_idx=None,
40
- device=None,
41
- dtype=None,
42
- ):
43
- if ssm_cfg is None:
44
- ssm_cfg = {}
45
- if attn_layer_idx is None:
46
- attn_layer_idx = []
47
- if attn_cfg is None:
48
- attn_cfg = {}
49
- factory_kwargs = {"device": device, "dtype": dtype}
50
- if layer_idx not in attn_layer_idx:
51
- # Create a copy of the config to modify
52
- ssm_cfg = copy.deepcopy(ssm_cfg) if ssm_cfg is not None else {}
53
- ssm_layer = ssm_cfg.pop("layer", "Mamba1")
54
- if ssm_layer not in ["Mamba1", "Mamba2"]:
55
- raise ValueError(
56
- f"Invalid ssm_layer: {ssm_layer}, only support Mamba1 and Mamba2"
57
- )
58
- mixer_cls = partial(
59
- Mamba2 if ssm_layer == "Mamba2" else Mamba,
60
- layer_idx=layer_idx,
61
- **ssm_cfg,
62
- **factory_kwargs,
63
- )
64
- else:
65
- mixer_cls = partial(MHA, layer_idx=layer_idx, **attn_cfg, **factory_kwargs)
66
- norm_cls = partial(
67
- nn.LayerNorm if not rms_norm else RMSNorm, eps=norm_epsilon, **factory_kwargs
68
- )
69
- if d_intermediate == 0:
70
- mlp_cls = nn.Identity
71
- else:
72
- mlp_cls = partial(
73
- GatedMLP,
74
- hidden_features=d_intermediate,
75
- out_features=d_model,
76
- **factory_kwargs,
77
- )
78
- block = Block(
79
- d_model,
80
- mixer_cls,
81
- mlp_cls,
82
- norm_cls=norm_cls,
83
- fused_add_norm=fused_add_norm,
84
- residual_in_fp32=residual_in_fp32,
85
- )
86
- block.layer_idx = layer_idx
87
- return block
88
-
89
-
90
- # https://github.com/huggingface/transformers/blob/c28d04e9e252a1a099944e325685f14d242ecdcd/src/transformers/models/gpt2/modeling_gpt2.py#L454
91
- def _init_weights(
92
- module,
93
- n_layer,
94
- initializer_range=0.02, # Now only used for embedding layer.
95
- rescale_prenorm_residual=True,
96
- n_residuals_per_layer=1, # Change to 2 if we have MLP
97
- ):
98
- if isinstance(module, nn.Linear):
99
- if module.bias is not None:
100
- if not getattr(module.bias, "_no_reinit", False):
101
- nn.init.zeros_(module.bias)
102
- elif isinstance(module, nn.Embedding):
103
- nn.init.normal_(module.weight, std=initializer_range)
104
-
105
- if rescale_prenorm_residual:
106
- # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
107
- # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
108
- # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
109
- # > -- GPT-2 :: https://openai.com/blog/better-language-models/
110
- #
111
- # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
112
- for name, p in module.named_parameters():
113
- if name in ["out_proj.weight", "fc2.weight"]:
114
- # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
115
- # Following Pytorch init, except scale by 1/sqrt(2 * n_layer)
116
- # We need to reinit p since this code could be called multiple times
117
- # Having just p *= scale would repeatedly scale it down
118
- nn.init.kaiming_uniform_(p, a=math.sqrt(5))
119
- with torch.no_grad():
120
- p /= math.sqrt(n_residuals_per_layer * n_layer)
121
-
122
-
123
- class MixerModel(nn.Module):
124
- def __init__(
125
- self,
126
- d_model: int,
127
- n_layer: int,
128
- d_intermediate: int,
129
- vocab_size: int,
130
- ssm_cfg=None,
131
- attn_layer_idx=None,
132
- attn_cfg=None,
133
- norm_epsilon: float = 1e-5,
134
- rms_norm: bool = False,
135
- initializer_cfg=None,
136
- fused_add_norm=False,
137
- residual_in_fp32=False,
138
- device=None,
139
- dtype=None,
140
- ) -> None:
141
- factory_kwargs = {"device": device, "dtype": dtype}
142
- super().__init__()
143
- self.residual_in_fp32 = residual_in_fp32
144
-
145
- self.embedding = nn.Embedding(vocab_size, d_model, **factory_kwargs)
146
-
147
- # We change the order of residual and layer norm:
148
- # Instead of LN -> Attn / MLP -> Add, we do:
149
- # Add -> LN -> Attn / MLP / Mixer, returning both the residual branch (output of Add) and
150
- # the main branch (output of MLP / Mixer). The model definition is unchanged.
151
- # This is for performance reason: we can fuse add + layer_norm.
152
- self.fused_add_norm = fused_add_norm
153
- if self.fused_add_norm:
154
- if layer_norm_fn is None or rms_norm_fn is None:
155
- raise ImportError("Failed to import Triton LayerNorm / RMSNorm kernels")
156
-
157
- self.layers = nn.ModuleList(
158
- [
159
- create_block(
160
- d_model,
161
- d_intermediate=d_intermediate,
162
- ssm_cfg=ssm_cfg,
163
- attn_layer_idx=attn_layer_idx,
164
- attn_cfg=attn_cfg,
165
- norm_epsilon=norm_epsilon,
166
- rms_norm=rms_norm,
167
- residual_in_fp32=residual_in_fp32,
168
- fused_add_norm=fused_add_norm,
169
- layer_idx=i,
170
- **factory_kwargs,
171
- )
172
- for i in range(n_layer)
173
- ]
174
- )
175
-
176
- self.norm_f = (nn.LayerNorm if not rms_norm else RMSNorm)(
177
- d_model, eps=norm_epsilon, **factory_kwargs
178
- )
179
-
180
- self.apply(
181
- partial(
182
- _init_weights,
183
- n_layer=n_layer,
184
- **(initializer_cfg if initializer_cfg is not None else {}),
185
- n_residuals_per_layer=(
186
- 1 if d_intermediate == 0 else 2
187
- ), # 2 if we have MLP
188
- )
189
- )
190
-
191
- def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
192
- return {
193
- i: layer.allocate_inference_cache(
194
- batch_size, max_seqlen, dtype=dtype, **kwargs
195
- )
196
- for i, layer in enumerate(self.layers)
197
- }
198
-
199
- def forward(self, input_ids, inference_params=None, **mixer_kwargs):
200
- hidden_states = self.embedding(input_ids)
201
- residual = None
202
- for layer in self.layers:
203
- hidden_states, residual = layer(
204
- hidden_states,
205
- residual,
206
- inference_params=inference_params,
207
- **mixer_kwargs,
208
- )
209
- if not self.fused_add_norm:
210
- residual = (
211
- (hidden_states + residual) if residual is not None else hidden_states
212
- )
213
- hidden_states = self.norm_f(residual.to(dtype=self.norm_f.weight.dtype))
214
- else:
215
- # Set prenorm=False here since we don't need the residual
216
- hidden_states = layer_norm_fn(
217
- hidden_states,
218
- self.norm_f.weight,
219
- self.norm_f.bias,
220
- eps=self.norm_f.eps,
221
- residual=residual,
222
- prenorm=False,
223
- residual_in_fp32=self.residual_in_fp32,
224
- is_rms_norm=isinstance(self.norm_f, RMSNorm),
225
- )
226
- return hidden_states
227
-
228
-
229
- class MambaLMHeadModel(nn.Module, GenerationMixin):
230
-
231
- def __init__(
232
- self,
233
- config: MambaConfig,
234
- initializer_cfg=None,
235
- device=None,
236
- dtype=None,
237
- ) -> None:
238
- self.config = config
239
- d_model = config.d_model
240
- n_layer = config.n_layer
241
- d_intermediate = config.d_intermediate
242
- vocab_size = config.vocab_size
243
- ssm_cfg = config.ssm_cfg
244
- attn_layer_idx = config.attn_layer_idx
245
- attn_cfg = config.attn_cfg
246
- rms_norm = config.rms_norm
247
- residual_in_fp32 = config.residual_in_fp32
248
- fused_add_norm = config.fused_add_norm
249
- pad_vocab_size_multiple = config.pad_vocab_size_multiple
250
- factory_kwargs = {"device": device, "dtype": dtype}
251
-
252
- super().__init__()
253
- if vocab_size % pad_vocab_size_multiple != 0:
254
- vocab_size += pad_vocab_size_multiple - (
255
- vocab_size % pad_vocab_size_multiple
256
- )
257
- self.backbone = MixerModel(
258
- d_model=d_model,
259
- n_layer=n_layer,
260
- d_intermediate=d_intermediate,
261
- vocab_size=vocab_size,
262
- ssm_cfg=ssm_cfg,
263
- attn_layer_idx=attn_layer_idx,
264
- attn_cfg=attn_cfg,
265
- rms_norm=rms_norm,
266
- initializer_cfg=initializer_cfg,
267
- fused_add_norm=fused_add_norm,
268
- residual_in_fp32=residual_in_fp32,
269
- **factory_kwargs,
270
- )
271
- self.lm_head = nn.Linear(d_model, vocab_size, bias=False, **factory_kwargs)
272
-
273
- # Initialize weights and apply final processing
274
- self.apply(
275
- partial(
276
- _init_weights,
277
- n_layer=n_layer,
278
- **(initializer_cfg if initializer_cfg is not None else {}),
279
- )
280
- )
281
- self.tie_weights()
282
-
283
- def tie_weights(self):
284
- if self.config.tie_embeddings:
285
- self.lm_head.weight = self.backbone.embedding.weight
286
-
287
- def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
288
- return self.backbone.allocate_inference_cache(
289
- batch_size, max_seqlen, dtype=dtype, **kwargs
290
- )
291
-
292
- def forward(
293
- self,
294
- input_ids,
295
- position_ids=None,
296
- inference_params=None,
297
- num_last_tokens=0,
298
- **mixer_kwargs,
299
- ):
300
- """
301
- "position_ids" is just to be compatible with Transformer generation. We don't use it.
302
- num_last_tokens: if > 0, only return the logits for the last n tokens
303
- """
304
- hidden_states = self.backbone(
305
- input_ids, inference_params=inference_params, **mixer_kwargs
306
- )
307
- if num_last_tokens > 0:
308
- hidden_states = hidden_states[:, -num_last_tokens:]
309
- lm_logits = self.lm_head(hidden_states)
310
- CausalLMOutput = namedtuple("CausalLMOutput", ["logits"])
311
- return CausalLMOutput(logits=lm_logits)
312
-
313
- @classmethod
314
- def from_pretrained(cls, pretrained_model_name, device=None, dtype=None, **kwargs):
315
- config_data = load_config_hf(pretrained_model_name)
316
- config = MambaConfig(**config_data)
317
- model = cls(config, device=device, dtype=dtype, **kwargs)
318
- model.load_state_dict(
319
- load_state_dict_hf(pretrained_model_name, device=device, dtype=dtype)
320
- )
321
- return model
322
-
323
- def save_pretrained(self, save_directory):
324
- """
325
- Minimal implementation of save_pretrained for MambaLMHeadModel.
326
- Save the model and its configuration file to a directory.
327
- """
328
- # Ensure save_directory exists
329
- os.makedirs(save_directory, exist_ok=True)
330
-
331
- # Save the model's state_dict
332
- model_path = os.path.join(save_directory, "pytorch_model.bin")
333
- torch.save(self.state_dict(), model_path)
334
-
335
- # Save the configuration of the model
336
- config_path = os.path.join(save_directory, "config.json")
337
- with open(config_path, "w") as f:
338
- json.dump(self.config.__dict__, f, indent=4)
 
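A minimal sketch of loading and querying the deleted `MambaLMHeadModel`; the checkpoint name is illustrative and the model is assumed to fit on a single GPU.

```python
# Hypothetical load / forward / save round-trip for MambaLMHeadModel.
import torch
from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel

model = MambaLMHeadModel.from_pretrained("state-spaces/mamba-130m",
                                         device="cuda", dtype=torch.float16)
input_ids = torch.randint(0, model.config.vocab_size, (1, 32), device="cuda")
logits = model(input_ids, num_last_tokens=1).logits   # (1, 1, padded_vocab_size)
model.save_pretrained("./mamba-checkpoint")           # writes pytorch_model.bin + config.json
```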
build/torch25-cxx11-cu121-x86_64-linux/mamba_ssm/ops/selective_scan_interface.py DELETED
@@ -1,659 +0,0 @@
1
- # Copyright (c) 2023, Tri Dao, Albert Gu.
2
-
3
- import torch
4
- import torch.nn.functional as F
5
- from ..utils.torch import custom_fwd, custom_bwd
6
-
7
- from einops import rearrange, repeat
8
-
9
- try:
10
- from causal_conv1d import causal_conv1d_fn
11
- import causal_conv1d_cuda
12
- except ImportError:
13
- causal_conv1d_fn = None
14
- causal_conv1d_cuda = None
15
-
16
- from .triton.layer_norm import _layer_norm_fwd
17
-
18
- from .._ops import ops
19
-
20
-
21
- class SelectiveScanFn(torch.autograd.Function):
22
-
23
- @staticmethod
24
- def forward(
25
- ctx,
26
- u,
27
- delta,
28
- A,
29
- B,
30
- C,
31
- D=None,
32
- z=None,
33
- delta_bias=None,
34
- delta_softplus=False,
35
- return_last_state=False,
36
- ):
37
- if u.stride(-1) != 1:
38
- u = u.contiguous()
39
- if delta.stride(-1) != 1:
40
- delta = delta.contiguous()
41
- if D is not None:
42
- D = D.contiguous()
43
- if B.stride(-1) != 1:
44
- B = B.contiguous()
45
- if C.stride(-1) != 1:
46
- C = C.contiguous()
47
- if z is not None and z.stride(-1) != 1:
48
- z = z.contiguous()
49
- if B.dim() == 3:
50
- B = rearrange(B, "b dstate l -> b 1 dstate l")
51
- ctx.squeeze_B = True
52
- if C.dim() == 3:
53
- C = rearrange(C, "b dstate l -> b 1 dstate l")
54
- ctx.squeeze_C = True
55
- out, x, *rest = ops.selective_scan_fwd(
56
- u, delta, A, B, C, D, z, delta_bias, delta_softplus
57
- )
58
- ctx.delta_softplus = delta_softplus
59
- ctx.has_z = z is not None
60
- last_state = x[:, :, -1, 1::2] # (batch, dim, dstate)
61
- if not ctx.has_z:
62
- ctx.save_for_backward(u, delta, A, B, C, D, delta_bias, x)
63
- return out if not return_last_state else (out, last_state)
64
- else:
65
- ctx.save_for_backward(u, delta, A, B, C, D, z, delta_bias, x, out)
66
- out_z = rest[0]
67
- return out_z if not return_last_state else (out_z, last_state)
68
-
69
- @staticmethod
70
- def backward(ctx, dout, *args):
71
- if not ctx.has_z:
72
- u, delta, A, B, C, D, delta_bias, x = ctx.saved_tensors
73
- z = None
74
- out = None
75
- else:
76
- u, delta, A, B, C, D, z, delta_bias, x, out = ctx.saved_tensors
77
- if dout.stride(-1) != 1:
78
- dout = dout.contiguous()
79
- # The kernel supports passing in a pre-allocated dz (e.g., in case we want to fuse the
80
- # backward of selective_scan_cuda with the backward of chunk).
81
- # Here we just pass in None and dz will be allocated in the C++ code.
82
- du, ddelta, dA, dB, dC, dD, ddelta_bias, *rest = ops.selective_scan_bwd(
83
- u,
84
- delta,
85
- A,
86
- B,
87
- C,
88
- D,
89
- z,
90
- delta_bias,
91
- dout,
92
- x,
93
- out,
94
- None,
95
- ctx.delta_softplus,
96
- False, # option to recompute out_z, not used here
97
- )
98
- dz = rest[0] if ctx.has_z else None
99
- dB = dB.squeeze(1) if getattr(ctx, "squeeze_B", False) else dB
100
- dC = dC.squeeze(1) if getattr(ctx, "squeeze_C", False) else dC
101
- return (
102
- du,
103
- ddelta,
104
- dA,
105
- dB,
106
- dC,
107
- dD if D is not None else None,
108
- dz,
109
- ddelta_bias if delta_bias is not None else None,
110
- None,
111
- None,
112
- )
113
-
114
-
115
- def rms_norm_forward(
116
- x,
117
- weight,
118
- bias,
119
- eps=1e-6,
120
- is_rms_norm=True,
121
- ):
122
- # x (b l) d
123
- if x.stride(-1) != 1:
124
- x = x.contiguous()
125
- weight = weight.contiguous()
126
- if bias is not None:
127
- bias = bias.contiguous()
128
- y = _layer_norm_fwd(
129
- x, weight, bias, eps, None, residual_dtype=None, is_rms_norm=is_rms_norm
130
- )[0]
131
- # y (b l) d
132
- return y
133
-
134
-
135
- def selective_scan_fn(
136
- u,
137
- delta,
138
- A,
139
- B,
140
- C,
141
- D=None,
142
- z=None,
143
- delta_bias=None,
144
- delta_softplus=False,
145
- return_last_state=False,
146
- ):
147
- """if return_last_state is True, returns (out, last_state)
148
- last_state has shape (batch, dim, dstate). Note that the gradient of the last state is
149
- not considered in the backward pass.
150
- """
151
- return SelectiveScanFn.apply(
152
- u, delta, A, B, C, D, z, delta_bias, delta_softplus, return_last_state
153
- )
154
-
155
-
156
- def selective_scan_ref(
157
- u,
158
- delta,
159
- A,
160
- B,
161
- C,
162
- D=None,
163
- z=None,
164
- delta_bias=None,
165
- delta_softplus=False,
166
- return_last_state=False,
167
- ):
168
- """
169
- u: r(B D L)
170
- delta: r(B D L)
171
- A: c(D N) or r(D N)
172
- B: c(D N) or r(B N L) or r(B N 2L) or r(B G N L) or (B G N L)
173
- C: c(D N) or r(B N L) or r(B N 2L) or r(B G N L) or (B G N L)
174
- D: r(D)
175
- z: r(B D L)
176
- delta_bias: r(D), fp32
177
-
178
- out: r(B D L)
179
- last_state (optional): r(B D dstate) or c(B D dstate)
180
- """
181
- dtype_in = u.dtype
182
- u = u.float()
183
- delta = delta.float()
184
- if delta_bias is not None:
185
- delta = delta + delta_bias[..., None].float()
186
- if delta_softplus:
187
- delta = F.softplus(delta)
188
- batch, dim, dstate = u.shape[0], A.shape[0], A.shape[1]
189
- is_variable_B = B.dim() >= 3
190
- is_variable_C = C.dim() >= 3
191
- if A.is_complex():
192
- if is_variable_B:
193
- B = torch.view_as_complex(
194
- rearrange(B.float(), "... (L two) -> ... L two", two=2)
195
- )
196
- if is_variable_C:
197
- C = torch.view_as_complex(
198
- rearrange(C.float(), "... (L two) -> ... L two", two=2)
199
- )
200
- else:
201
- B = B.float()
202
- C = C.float()
203
- x = A.new_zeros((batch, dim, dstate))
204
- ys = []
205
- deltaA = torch.exp(torch.einsum("bdl,dn->bdln", delta, A))
206
- if not is_variable_B:
207
- deltaB_u = torch.einsum("bdl,dn,bdl->bdln", delta, B, u)
208
- else:
209
- if B.dim() == 3:
210
- deltaB_u = torch.einsum("bdl,bnl,bdl->bdln", delta, B, u)
211
- else:
212
- B = repeat(B, "B G N L -> B (G H) N L", H=dim // B.shape[1])
213
- deltaB_u = torch.einsum("bdl,bdnl,bdl->bdln", delta, B, u)
214
- if is_variable_C and C.dim() == 4:
215
- C = repeat(C, "B G N L -> B (G H) N L", H=dim // C.shape[1])
216
- last_state = None
217
- for i in range(u.shape[2]):
218
- x = deltaA[:, :, i] * x + deltaB_u[:, :, i]
219
- if not is_variable_C:
220
- y = torch.einsum("bdn,dn->bd", x, C)
221
- else:
222
- if C.dim() == 3:
223
- y = torch.einsum("bdn,bn->bd", x, C[:, :, i])
224
- else:
225
- y = torch.einsum("bdn,bdn->bd", x, C[:, :, :, i])
226
- if i == u.shape[2] - 1:
227
- last_state = x
228
- if y.is_complex():
229
- y = y.real * 2
230
- ys.append(y)
231
- y = torch.stack(ys, dim=2) # (batch dim L)
232
- out = y if D is None else y + u * rearrange(D, "d -> d 1")
233
- if z is not None:
234
- out = out * F.silu(z)
235
- out = out.to(dtype=dtype_in)
236
- return out if not return_last_state else (out, last_state)
237
-
238
-
239
- class MambaInnerFn(torch.autograd.Function):
240
-
241
- @staticmethod
242
- @custom_fwd
243
- def forward(
244
- ctx,
245
- xz,
246
- conv1d_weight,
247
- conv1d_bias,
248
- x_proj_weight,
249
- delta_proj_weight,
250
- out_proj_weight,
251
- out_proj_bias,
252
- A,
253
- B=None,
254
- C=None,
255
- D=None,
256
- delta_bias=None,
257
- B_proj_bias=None,
258
- C_proj_bias=None,
259
- delta_softplus=True,
260
- checkpoint_lvl=1,
261
- b_rms_weight=None,
262
- c_rms_weight=None,
263
- dt_rms_weight=None,
264
- b_c_dt_rms_eps=1e-6,
265
- ):
266
- """
267
- xz: (batch, dim, seqlen)
268
- """
269
- assert (
270
- causal_conv1d_cuda is not None
271
- ), "causal_conv1d_cuda is not available. Please install causal-conv1d."
272
- assert checkpoint_lvl in [0, 1]
273
- L = xz.shape[-1]
274
- delta_rank = delta_proj_weight.shape[1]
275
- d_state = A.shape[-1] * (1 if not A.is_complex() else 2)
276
- if torch.is_autocast_enabled():
277
- x_proj_weight = x_proj_weight.to(dtype=torch.get_autocast_gpu_dtype())
278
- delta_proj_weight = delta_proj_weight.to(
279
- dtype=torch.get_autocast_gpu_dtype()
280
- )
281
- out_proj_weight = out_proj_weight.to(dtype=torch.get_autocast_gpu_dtype())
282
- out_proj_bias = (
283
- out_proj_bias.to(dtype=torch.get_autocast_gpu_dtype())
284
- if out_proj_bias is not None
285
- else None
286
- )
287
- if xz.stride(-1) != 1:
288
- xz = xz.contiguous()
289
- conv1d_weight = rearrange(conv1d_weight, "d 1 w -> d w")
290
- x, z = xz.chunk(2, dim=1)
291
- conv1d_bias = conv1d_bias.contiguous() if conv1d_bias is not None else None
292
- conv1d_out = causal_conv1d_cuda.causal_conv1d_fwd(
293
- x, conv1d_weight, conv1d_bias, None, None, None, True
294
- )
295
- # We're being very careful here about the layout, to avoid extra transposes.
296
- # We want delta to have d as the slowest moving dimension
297
- # and L as the fastest moving dimension, since those are what the ssm_scan kernel expects.
298
- x_dbl = F.linear(
299
- rearrange(conv1d_out, "b d l -> (b l) d"), x_proj_weight
300
- ) # (bl d)
301
- delta = rearrange(
302
- delta_proj_weight @ x_dbl[:, :delta_rank].t(), "d (b l) -> b d l", l=L
303
- )
304
- ctx.is_variable_B = B is None
305
- ctx.is_variable_C = C is None
306
- ctx.B_proj_bias_is_None = B_proj_bias is None
307
- ctx.C_proj_bias_is_None = C_proj_bias is None
308
- if B is None: # variable B
309
- B = x_dbl[:, delta_rank : delta_rank + d_state] # (bl dstate)
310
- if B_proj_bias is not None:
311
- B = B + B_proj_bias.to(dtype=B.dtype)
312
- if not A.is_complex():
313
- # B = rearrange(B, "(b l) dstate -> b dstate l", l=L).contiguous()
314
- B = rearrange(B, "(b l) dstate -> b 1 dstate l", l=L).contiguous()
315
- else:
316
- B = rearrange(
317
- B, "(b l) (dstate two) -> b 1 dstate (l two)", l=L, two=2
318
- ).contiguous()
319
- else:
320
- if B.stride(-1) != 1:
321
- B = B.contiguous()
322
- if C is None: # variable C
323
- C = x_dbl[:, -d_state:] # (bl dstate)
324
- if C_proj_bias is not None:
325
- C = C + C_proj_bias.to(dtype=C.dtype)
326
- if not A.is_complex():
327
- # C = rearrange(C, "(b l) dstate -> b dstate l", l=L).contiguous()
328
- C = rearrange(C, "(b l) dstate -> b 1 dstate l", l=L).contiguous()
329
- else:
330
- C = rearrange(
331
- C, "(b l) (dstate two) -> b 1 dstate (l two)", l=L, two=2
332
- ).contiguous()
333
- else:
334
- if C.stride(-1) != 1:
335
- C = C.contiguous()
336
- if D is not None:
337
- D = D.contiguous()
338
-
339
- if b_rms_weight is not None:
340
- B = rearrange(B, "b 1 dstate l -> (b l) dstate", l=L).contiguous()
341
- B = rms_norm_forward(B, b_rms_weight, bias=None, eps=b_c_dt_rms_eps)
342
- B = rearrange(B, "(b l) dstate -> b 1 dstate l", l=L).contiguous()
343
- if c_rms_weight is not None:
344
- C = rearrange(C, "b 1 dstate l -> (b l) dstate", l=L).contiguous()
345
- C = rms_norm_forward(C, c_rms_weight, bias=None, eps=b_c_dt_rms_eps)
346
- C = rearrange(C, "(b l) dstate -> b 1 dstate l", l=L).contiguous()
347
- if dt_rms_weight is not None:
348
- delta = rearrange(delta, "b d l -> (b l) d", l=L).contiguous()
349
- delta = rms_norm_forward(
350
- delta, dt_rms_weight, bias=None, eps=b_c_dt_rms_eps
351
- )
352
- delta = rearrange(delta, "(b l) d -> b d l", l=L).contiguous()
353
-
354
- out, scan_intermediates, out_z = ops.selective_scan_fwd(
355
- conv1d_out, delta, A, B, C, D, z, delta_bias, delta_softplus
356
- )
357
- ctx.delta_softplus = delta_softplus
358
- ctx.out_proj_bias_is_None = out_proj_bias is None
359
- ctx.checkpoint_lvl = checkpoint_lvl
360
- ctx.b_rms_weight = b_rms_weight
361
- ctx.c_rms_weight = c_rms_weight
362
- ctx.dt_rms_weight = dt_rms_weight
363
- ctx.b_c_dt_rms_eps = b_c_dt_rms_eps
364
- if (
365
- checkpoint_lvl >= 1
366
- ): # Will recompute conv1d_out and delta in the backward pass
367
- conv1d_out, delta = None, None
368
- ctx.save_for_backward(
369
- xz,
370
- conv1d_weight,
371
- conv1d_bias,
372
- x_dbl,
373
- x_proj_weight,
374
- delta_proj_weight,
375
- out_proj_weight,
376
- conv1d_out,
377
- delta,
378
- A,
379
- B,
380
- C,
381
- D,
382
- delta_bias,
383
- scan_intermediates,
384
- b_rms_weight,
385
- c_rms_weight,
386
- dt_rms_weight,
387
- out,
388
- )
389
- return F.linear(
390
- rearrange(out_z, "b d l -> b l d"), out_proj_weight, out_proj_bias
391
- )
392
-
393
- @staticmethod
394
- @custom_bwd
395
- def backward(ctx, dout):
396
- # dout: (batch, seqlen, dim)
397
- assert (
398
- causal_conv1d_cuda is not None
399
- ), "causal_conv1d_cuda is not available. Please install causal-conv1d."
400
- (
401
- xz,
402
- conv1d_weight,
403
- conv1d_bias,
404
- x_dbl,
405
- x_proj_weight,
406
- delta_proj_weight,
407
- out_proj_weight,
408
- conv1d_out,
409
- delta,
410
- A,
411
- B,
412
- C,
413
- D,
414
- delta_bias,
415
- scan_intermediates,
416
- b_rms_weight,
417
- c_rms_weight,
418
- dt_rms_weight,
419
- out,
420
- ) = ctx.saved_tensors
421
- L = xz.shape[-1]
422
- delta_rank = delta_proj_weight.shape[1]
423
- d_state = A.shape[-1] * (1 if not A.is_complex() else 2)
424
- x, z = xz.chunk(2, dim=1)
425
- if dout.stride(-1) != 1:
426
- dout = dout.contiguous()
427
- if ctx.checkpoint_lvl == 1:
428
- conv1d_out = causal_conv1d_cuda.causal_conv1d_fwd(
429
- x, conv1d_weight, conv1d_bias, None, None, None, True
430
- )
431
- delta = rearrange(
432
- delta_proj_weight @ x_dbl[:, :delta_rank].t(), "d (b l) -> b d l", l=L
433
- )
434
- if dt_rms_weight is not None:
435
- delta = rearrange(delta, "b d l -> (b l) d", l=L).contiguous()
436
- delta = rms_norm_forward(
437
- delta, ctx.dt_rms_weight, None, ctx.b_c_dt_rms_eps
438
- )
439
- delta = rearrange(delta, "(b l) d -> b d l", l=L).contiguous()
440
- if b_rms_weight is not None:
441
- # Recompute & RMSNorm B
442
- B = rearrange(B, "b 1 dstate l -> (b l) dstate", l=L).contiguous()
443
- B = rms_norm_forward(B, ctx.b_rms_weight, None, ctx.b_c_dt_rms_eps)
444
- B = rearrange(B, "(b l) dstate -> b 1 dstate l", l=L).contiguous()
445
- if c_rms_weight is not None:
446
- # Recompute & RMSNorm C
447
- C = rearrange(C, "b 1 dstate l -> (b l) dstate", l=L).contiguous()
448
- C = rms_norm_forward(C, ctx.c_rms_weight, None, ctx.b_c_dt_rms_eps)
449
- C = rearrange(C, "(b l) dstate -> b 1 dstate l", l=L).contiguous()
450
-
451
- # The kernel supports passing in a pre-allocated dz (e.g., in case we want to fuse the
452
- # backward of selective_scan_cuda with the backward of chunk).
453
- dxz = torch.empty_like(xz) # (batch, dim, seqlen)
454
- dx, dz = dxz.chunk(2, dim=1)
455
- dout = rearrange(dout, "b l e -> e (b l)")
456
- dout_y = rearrange(out_proj_weight.t() @ dout, "d (b l) -> b d l", l=L)
457
- dconv1d_out, ddelta, dA, dB, dC, dD, ddelta_bias, dz, out_z = (
458
- ops.selective_scan_bwd(
459
- conv1d_out,
460
- delta,
461
- A,
462
- B,
463
- C,
464
- D,
465
- z,
466
- delta_bias,
467
- dout_y,
468
- scan_intermediates,
469
- out,
470
- dz,
471
- ctx.delta_softplus,
472
- True, # option to recompute out_z
473
- )
474
- )
475
- dout_proj_weight = torch.einsum(
476
- "eB,dB->ed", dout, rearrange(out_z, "b d l -> d (b l)")
477
- )
478
- dout_proj_bias = dout.sum(dim=(0, 1)) if not ctx.out_proj_bias_is_None else None
479
- dD = dD if D is not None else None
480
- dx_dbl = torch.empty_like(x_dbl)
481
- dB_proj_bias = None
482
- if ctx.is_variable_B:
483
- if not A.is_complex():
484
- dB = rearrange(dB, "b 1 dstate l -> (b l) dstate").contiguous()
485
- else:
486
- dB = rearrange(
487
- dB, "b 1 dstate (l two) -> (b l) (dstate two)", two=2
488
- ).contiguous()
489
- dB_proj_bias = dB.sum(0) if not ctx.B_proj_bias_is_None else None
490
- dx_dbl[:, delta_rank : delta_rank + d_state] = dB # (bl d)
491
- dB = None
492
- dC_proj_bias = None
493
- if ctx.is_variable_C:
494
- if not A.is_complex():
495
- dC = rearrange(dC, "b 1 dstate l -> (b l) dstate").contiguous()
496
- else:
497
- dC = rearrange(
498
- dC, "b 1 dstate (l two) -> (b l) (dstate two)", two=2
499
- ).contiguous()
500
- dC_proj_bias = dC.sum(0) if not ctx.C_proj_bias_is_None else None
501
- dx_dbl[:, -d_state:] = dC # (bl d)
502
- dC = None
503
- ddelta = rearrange(ddelta, "b d l -> d (b l)")
504
- ddelta_proj_weight = torch.einsum("dB,Br->dr", ddelta, x_dbl[:, :delta_rank])
505
- dx_dbl[:, :delta_rank] = torch.einsum("dB,dr->Br", ddelta, delta_proj_weight)
506
- dconv1d_out = rearrange(dconv1d_out, "b d l -> d (b l)")
507
- dx_proj_weight = torch.einsum(
508
- "Br,Bd->rd", dx_dbl, rearrange(conv1d_out, "b d l -> (b l) d")
509
- )
510
- dconv1d_out = torch.addmm(
511
- dconv1d_out, x_proj_weight.t(), dx_dbl.t(), out=dconv1d_out
512
- )
513
- dconv1d_out = rearrange(
514
- dconv1d_out, "d (b l) -> b d l", b=x.shape[0], l=x.shape[-1]
515
- )
516
- # The kernel supports passing in a pre-allocated dx (e.g., in case we want to fuse the
517
- # backward of conv1d with the backward of chunk).
518
- dx, dconv1d_weight, dconv1d_bias, *_ = causal_conv1d_cuda.causal_conv1d_bwd(
519
- x,
520
- conv1d_weight,
521
- conv1d_bias,
522
- dconv1d_out,
523
- None,
524
- None,
525
- None,
526
- dx,
527
- False,
528
- True,
529
- )
530
- dconv1d_bias = dconv1d_bias if conv1d_bias is not None else None
531
- dconv1d_weight = rearrange(dconv1d_weight, "d w -> d 1 w")
532
- return (
533
- dxz,
534
- dconv1d_weight,
535
- dconv1d_bias,
536
- dx_proj_weight,
537
- ddelta_proj_weight,
538
- dout_proj_weight,
539
- dout_proj_bias,
540
- dA,
541
- dB,
542
- dC,
543
- dD,
544
- ddelta_bias if delta_bias is not None else None,
545
- # 6-None are delta_softplus, checkpoint_lvl, b_rms_weight, c_rms_weight, dt_rms_weight, b_c_dt_rms_eps
546
- dB_proj_bias,
547
- dC_proj_bias,
548
- None,
549
- None,
550
- None,
551
- None,
552
- None,
553
- None,
554
- )
555
-
556
-
557
- def mamba_inner_fn(
558
- xz,
559
- conv1d_weight,
560
- conv1d_bias,
561
- x_proj_weight,
562
- delta_proj_weight,
563
- out_proj_weight,
564
- out_proj_bias,
565
- A,
566
- B=None,
567
- C=None,
568
- D=None,
569
- delta_bias=None,
570
- B_proj_bias=None,
571
- C_proj_bias=None,
572
- delta_softplus=True,
573
- checkpoint_lvl=1,
574
- b_rms_weight=None,
575
- c_rms_weight=None,
576
- dt_rms_weight=None,
577
- b_c_dt_rms_eps=1e-6,
578
- ):
579
- return MambaInnerFn.apply(
580
- xz,
581
- conv1d_weight,
582
- conv1d_bias,
583
- x_proj_weight,
584
- delta_proj_weight,
585
- out_proj_weight,
586
- out_proj_bias,
587
- A,
588
- B,
589
- C,
590
- D,
591
- delta_bias,
592
- B_proj_bias,
593
- C_proj_bias,
594
- delta_softplus,
595
- checkpoint_lvl,
596
- b_rms_weight,
597
- c_rms_weight,
598
- dt_rms_weight,
599
- b_c_dt_rms_eps,
600
- )
601
-
602
-
603
- def mamba_inner_ref(
604
- xz,
605
- conv1d_weight,
606
- conv1d_bias,
607
- x_proj_weight,
608
- delta_proj_weight,
609
- out_proj_weight,
610
- out_proj_bias,
611
- A,
612
- B=None,
613
- C=None,
614
- D=None,
615
- delta_bias=None,
616
- B_proj_bias=None,
617
- C_proj_bias=None,
618
- delta_softplus=True,
619
- ):
620
- assert (
621
- causal_conv1d_fn is not None
622
- ), "causal_conv1d_fn is not available. Please install causal-conv1d."
623
- L = xz.shape[-1]
624
- delta_rank = delta_proj_weight.shape[1]
625
- d_state = A.shape[-1] * (1 if not A.is_complex() else 2)
626
- x, z = xz.chunk(2, dim=1)
627
- x = causal_conv1d_fn(
628
- x, rearrange(conv1d_weight, "d 1 w -> d w"), conv1d_bias, activation="silu"
629
- )
630
- # We're being very careful here about the layout, to avoid extra transposes.
631
- # We want delta to have d as the slowest moving dimension
632
- # and L as the fastest moving dimension, since those are what the ssm_scan kernel expects.
633
- x_dbl = F.linear(rearrange(x, "b d l -> (b l) d"), x_proj_weight) # (bl d)
634
- delta = delta_proj_weight @ x_dbl[:, :delta_rank].t()
635
- delta = rearrange(delta, "d (b l) -> b d l", l=L)
636
- if B is None: # variable B
637
- B = x_dbl[:, delta_rank : delta_rank + d_state] # (bl d)
638
- if B_proj_bias is not None:
639
- B = B + B_proj_bias.to(dtype=B.dtype)
640
- if not A.is_complex():
641
- B = rearrange(B, "(b l) dstate -> b dstate l", l=L).contiguous()
642
- else:
643
- B = rearrange(
644
- B, "(b l) (dstate two) -> b dstate (l two)", l=L, two=2
645
- ).contiguous()
646
- if C is None: # variable C
647
- C = x_dbl[:, -d_state:] # (bl d)
648
- if C_proj_bias is not None:
649
- C = C + C_proj_bias.to(dtype=C.dtype)
650
- if not A.is_complex():
651
- C = rearrange(C, "(b l) dstate -> b dstate l", l=L).contiguous()
652
- else:
653
- C = rearrange(
654
- C, "(b l) (dstate two) -> b dstate (l two)", l=L, two=2
655
- ).contiguous()
656
- y = selective_scan_fn(
657
- x, delta, A, B, C, D, z=z, delta_bias=delta_bias, delta_softplus=True
658
- )
659
- return F.linear(rearrange(y, "b d l -> b l d"), out_proj_weight, out_proj_bias)
 
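A usage sketch for the deleted `selective_scan_fn`, using the variable-B/C shapes documented in `selective_scan_ref` above; sizes and dtypes are illustrative assumptions.

```python
# Hypothetical call to selective_scan_fn with per-timestep (variable) B and C.
import torch
from mamba_ssm.ops.selective_scan_interface import selective_scan_fn

batch, dim, seqlen, dstate = 2, 64, 128, 16
u = torch.randn(batch, dim, seqlen, device="cuda", dtype=torch.float16)
delta = torch.rand(batch, dim, seqlen, device="cuda", dtype=torch.float16)
A = -torch.rand(dim, dstate, device="cuda", dtype=torch.float32)   # real, negative A
B = torch.randn(batch, dstate, seqlen, device="cuda", dtype=torch.float16)
C = torch.randn(batch, dstate, seqlen, device="cuda", dtype=torch.float16)
D = torch.randn(dim, device="cuda", dtype=torch.float32)
z = torch.randn(batch, dim, seqlen, device="cuda", dtype=torch.float16)

out, last_state = selective_scan_fn(u, delta, A, B, C, D=D, z=z,
                                    delta_softplus=True, return_last_state=True)
# out: (2, 64, 128); last_state: (2, 64, 16), whose gradient is not tracked
```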
build/torch25-cxx11-cu121-x86_64-linux/mamba_ssm/ops/triton/layer_norm.py DELETED
@@ -1,1166 +0,0 @@
1
- # Copyright (c) 2024, Tri Dao.
2
- # Implement dropout + residual + layer_norm / rms_norm.
3
-
4
- # Based on the Triton LayerNorm tutorial: https://triton-lang.org/main/getting-started/tutorials/05-layer-norm.html
5
- # For the backward pass, we keep weight_grad and bias_grad in registers and accumulate.
6
- # This is faster for dimensions up to 8k, but after that it's much slower due to register spilling.
7
- # The models we train have hidden dim up to 8k anyway (e.g. Llama 70B), so this is fine.
8
-
9
- import math
10
- import warnings
11
-
12
- import torch
13
- import torch.nn.functional as F
14
- from ...utils.torch import custom_bwd, custom_fwd
15
-
16
- import triton
17
- import triton.language as tl
18
-
19
-
20
- def layer_norm_ref(
21
- x,
22
- weight,
23
- bias,
24
- residual=None,
25
- x1=None,
26
- weight1=None,
27
- bias1=None,
28
- eps=1e-6,
29
- dropout_p=0.0,
30
- rowscale=None,
31
- prenorm=False,
32
- dropout_mask=None,
33
- dropout_mask1=None,
34
- upcast=False,
35
- ):
36
- dtype = x.dtype
37
- if upcast:
38
- x = x.float()
39
- weight = weight.float()
40
- bias = bias.float() if bias is not None else None
41
- residual = residual.float() if residual is not None else residual
42
- x1 = x1.float() if x1 is not None else None
43
- weight1 = weight1.float() if weight1 is not None else None
44
- bias1 = bias1.float() if bias1 is not None else None
45
- if x1 is not None:
46
- assert rowscale is None, "rowscale is not supported with parallel LayerNorm"
47
- if rowscale is not None:
48
- x = x * rowscale[..., None]
49
- if dropout_p > 0.0:
50
- if dropout_mask is not None:
51
- x = x.masked_fill(~dropout_mask, 0.0) / (1.0 - dropout_p)
52
- else:
53
- x = F.dropout(x, p=dropout_p)
54
- if x1 is not None:
55
- if dropout_mask1 is not None:
56
- x1 = x1.masked_fill(~dropout_mask1, 0.0) / (1.0 - dropout_p)
57
- else:
58
- x1 = F.dropout(x1, p=dropout_p)
59
- if x1 is not None:
60
- x = x + x1
61
- if residual is not None:
62
- x = (x + residual).to(x.dtype)
63
- out = F.layer_norm(
64
- x.to(weight.dtype), x.shape[-1:], weight=weight, bias=bias, eps=eps
65
- ).to(dtype)
66
- if weight1 is None:
67
- return out if not prenorm else (out, x)
68
- else:
69
- out1 = F.layer_norm(
70
- x.to(weight1.dtype), x.shape[-1:], weight=weight1, bias=bias1, eps=eps
71
- ).to(dtype)
72
- return (out, out1) if not prenorm else (out, out1, x)
73
-
74
-
75
- def rms_norm_ref(
76
- x,
77
- weight,
78
- bias,
79
- residual=None,
80
- x1=None,
81
- weight1=None,
82
- bias1=None,
83
- eps=1e-6,
84
- dropout_p=0.0,
85
- rowscale=None,
86
- prenorm=False,
87
- dropout_mask=None,
88
- dropout_mask1=None,
89
- upcast=False,
90
- ):
91
- dtype = x.dtype
92
- if upcast:
93
- x = x.float()
94
- weight = weight.float()
95
- bias = bias.float() if bias is not None else None
96
- residual = residual.float() if residual is not None else residual
97
- x1 = x1.float() if x1 is not None else None
98
- weight1 = weight1.float() if weight1 is not None else None
99
- bias1 = bias1.float() if bias1 is not None else None
100
- if x1 is not None:
101
- assert rowscale is None, "rowscale is not supported with parallel LayerNorm"
102
- if rowscale is not None:
103
- x = x * rowscale[..., None]
104
- if dropout_p > 0.0:
105
- if dropout_mask is not None:
106
- x = x.masked_fill(~dropout_mask, 0.0) / (1.0 - dropout_p)
107
- else:
108
- x = F.dropout(x, p=dropout_p)
109
- if x1 is not None:
110
- if dropout_mask1 is not None:
111
- x1 = x1.masked_fill(~dropout_mask1, 0.0) / (1.0 - dropout_p)
112
- else:
113
- x1 = F.dropout(x1, p=dropout_p)
114
- if x1 is not None:
115
- x = x + x1
116
- if residual is not None:
117
- x = (x + residual).to(x.dtype)
118
- rstd = 1 / torch.sqrt((x.square()).mean(dim=-1, keepdim=True) + eps)
119
- out = ((x * rstd * weight) + bias if bias is not None else (x * rstd * weight)).to(
120
- dtype
121
- )
122
- if weight1 is None:
123
- return out if not prenorm else (out, x)
124
- else:
125
- out1 = (
126
- (x * rstd * weight1) + bias1 if bias1 is not None else (x * rstd * weight1)
127
- ).to(dtype)
128
- return (out, out1) if not prenorm else (out, out1, x)
129
-
130
-
131
- def config_prune(configs):
132
-
133
- if torch.version.hip:
134
- try:
135
- # set warp size based on gcn architecture
136
- gcn_arch_name = torch.cuda.get_device_properties(0).gcnArchName
137
- if "gfx10" in gcn_arch_name or "gfx11" in gcn_arch_name:
138
- # radeon
139
- warp_size = 32
140
- else:
141
- # instinct
142
- warp_size = 64
143
- except AttributeError as e:
144
- # fall back to crude method to set warp size
145
- device_name = torch.cuda.get_device_properties(0).name
146
- if "instinct" in device_name.lower():
147
- warp_size = 64
148
- else:
149
- warp_size = 32
150
- warnings.warn(
151
- f"{e}, warp size set to {warp_size} based on device name: {device_name}",
152
- UserWarning,
153
- )
154
-
155
- else:
156
- # cuda
157
- warp_size = 32
158
-
159
- max_block_sz = 1024
160
- max_num_warps = max_block_sz // warp_size
161
- pruned_configs = [config for config in configs if config.num_warps <= max_num_warps]
162
- return pruned_configs
163
-
164
-
165
- configs_autotune = [
166
- triton.Config({}, num_warps=1),
167
- triton.Config({}, num_warps=2),
168
- triton.Config({}, num_warps=4),
169
- triton.Config({}, num_warps=8),
170
- triton.Config({}, num_warps=16),
171
- triton.Config({}, num_warps=32),
172
- ]
173
-
174
- pruned_configs_autotune = config_prune(configs_autotune)
175
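
For context, the pruning above caps num_warps at max_block_sz // warp_size, so on a 64-wide Instinct wavefront only configs with at most 16 warps survive, while on 32-wide hardware all six candidates pass. A minimal sketch of that arithmetic, assuming only the Triton Config API (the numbers are illustrative, not measured):

# Hedged sketch of the warp-budget arithmetic behind config_prune above.
# Only the triton.Config API is assumed; values are illustrative.
import triton

warp_size = 64                               # Instinct (CDNA) wavefront; Radeon/NVIDIA use 32
max_block_sz = 1024                          # thread-block budget assumed by the kernel
max_num_warps = max_block_sz // warp_size    # -> 16

candidates = [triton.Config({}, num_warps=w) for w in (1, 2, 4, 8, 16, 32)]
pruned = [cfg for cfg in candidates if cfg.num_warps <= max_num_warps]
# warp_size=64 drops the num_warps=32 config; with warp_size=32 all six configs survive.
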
-
176
-
177
- @triton.autotune(
178
- configs=pruned_configs_autotune,
179
- key=["N", "HAS_RESIDUAL", "STORE_RESIDUAL_OUT", "IS_RMS_NORM", "HAS_BIAS"],
180
- )
181
- # @triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None})
182
- # @triton.heuristics({"HAS_RESIDUAL": lambda args: args["RESIDUAL"] is not None})
183
- @triton.heuristics({"HAS_X1": lambda args: args["X1"] is not None})
184
- @triton.heuristics({"HAS_W1": lambda args: args["W1"] is not None})
185
- @triton.heuristics({"HAS_B1": lambda args: args["B1"] is not None})
186
- @triton.jit
187
- def _layer_norm_fwd_1pass_kernel(
188
- X, # pointer to the input
189
- Y, # pointer to the output
190
- W, # pointer to the weights
191
- B, # pointer to the biases
192
- RESIDUAL, # pointer to the residual
193
- X1,
194
- W1,
195
- B1,
196
- Y1,
197
- RESIDUAL_OUT, # pointer to the residual
198
- ROWSCALE,
199
- SEEDS, # Dropout seeds for each row
200
- DROPOUT_MASK,
201
- Mean, # pointer to the mean
202
- Rstd, # pointer to the 1/std
203
- stride_x_row, # how much to increase the pointer when moving by 1 row
204
- stride_y_row,
205
- stride_res_row,
206
- stride_res_out_row,
207
- stride_x1_row,
208
- stride_y1_row,
209
- M, # number of rows in X
210
- N, # number of columns in X
211
- eps, # epsilon to avoid division by zero
212
- dropout_p, # Dropout probability
213
- IS_RMS_NORM: tl.constexpr,
214
- BLOCK_N: tl.constexpr,
215
- HAS_RESIDUAL: tl.constexpr,
216
- STORE_RESIDUAL_OUT: tl.constexpr,
217
- HAS_BIAS: tl.constexpr,
218
- HAS_DROPOUT: tl.constexpr,
219
- STORE_DROPOUT_MASK: tl.constexpr,
220
- HAS_ROWSCALE: tl.constexpr,
221
- HAS_X1: tl.constexpr,
222
- HAS_W1: tl.constexpr,
223
- HAS_B1: tl.constexpr,
224
- ):
225
- # Map the program id to the row of X and Y it should compute.
226
- row = tl.program_id(0)
227
- X += row * stride_x_row
228
- Y += row * stride_y_row
229
- if HAS_RESIDUAL:
230
- RESIDUAL += row * stride_res_row
231
- if STORE_RESIDUAL_OUT:
232
- RESIDUAL_OUT += row * stride_res_out_row
233
- if HAS_X1:
234
- X1 += row * stride_x1_row
235
- if HAS_W1:
236
- Y1 += row * stride_y1_row
237
- # Compute mean and variance
238
- cols = tl.arange(0, BLOCK_N)
239
- x = tl.load(X + cols, mask=cols < N, other=0.0).to(tl.float32)
240
- if HAS_ROWSCALE:
241
- rowscale = tl.load(ROWSCALE + row).to(tl.float32)
242
- x *= rowscale
243
- if HAS_DROPOUT:
244
- # Compute dropout mask
245
- # 7 rounds is good enough, and reduces register pressure
246
- keep_mask = (
247
- tl.rand(tl.load(SEEDS + row).to(tl.uint32), cols, n_rounds=7) > dropout_p
248
- )
249
- x = tl.where(keep_mask, x / (1.0 - dropout_p), 0.0)
250
- if STORE_DROPOUT_MASK:
251
- tl.store(DROPOUT_MASK + row * N + cols, keep_mask, mask=cols < N)
252
- if HAS_X1:
253
- x1 = tl.load(X1 + cols, mask=cols < N, other=0.0).to(tl.float32)
254
- if HAS_ROWSCALE:
255
- rowscale = tl.load(ROWSCALE + M + row).to(tl.float32)
256
- x1 *= rowscale
257
- if HAS_DROPOUT:
258
- # Compute dropout mask
259
- # 7 rounds is good enough, and reduces register pressure
260
- keep_mask = (
261
- tl.rand(tl.load(SEEDS + M + row).to(tl.uint32), cols, n_rounds=7)
262
- > dropout_p
263
- )
264
- x1 = tl.where(keep_mask, x1 / (1.0 - dropout_p), 0.0)
265
- if STORE_DROPOUT_MASK:
266
- tl.store(DROPOUT_MASK + (M + row) * N + cols, keep_mask, mask=cols < N)
267
- x += x1
268
- if HAS_RESIDUAL:
269
- residual = tl.load(RESIDUAL + cols, mask=cols < N, other=0.0).to(tl.float32)
270
- x += residual
271
- if STORE_RESIDUAL_OUT:
272
- tl.store(RESIDUAL_OUT + cols, x, mask=cols < N)
273
- if not IS_RMS_NORM:
274
- mean = tl.sum(x, axis=0) / N
275
- tl.store(Mean + row, mean)
276
- xbar = tl.where(cols < N, x - mean, 0.0)
277
- var = tl.sum(xbar * xbar, axis=0) / N
278
- else:
279
- xbar = tl.where(cols < N, x, 0.0)
280
- var = tl.sum(xbar * xbar, axis=0) / N
281
- rstd = 1 / tl.sqrt(var + eps)
282
- tl.store(Rstd + row, rstd)
283
- # Normalize and apply linear transformation
284
- mask = cols < N
285
- w = tl.load(W + cols, mask=mask).to(tl.float32)
286
- if HAS_BIAS:
287
- b = tl.load(B + cols, mask=mask).to(tl.float32)
288
- x_hat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd
289
- y = x_hat * w + b if HAS_BIAS else x_hat * w
290
- # Write output
291
- tl.store(Y + cols, y, mask=mask)
292
- if HAS_W1:
293
- w1 = tl.load(W1 + cols, mask=mask).to(tl.float32)
294
- if HAS_B1:
295
- b1 = tl.load(B1 + cols, mask=mask).to(tl.float32)
296
- y1 = x_hat * w1 + b1 if HAS_B1 else x_hat * w1
297
- tl.store(Y1 + cols, y1, mask=mask)
298
-
299
-
300
- def _layer_norm_fwd(
301
- x,
302
- weight,
303
- bias,
304
- eps,
305
- residual=None,
306
- x1=None,
307
- weight1=None,
308
- bias1=None,
309
- dropout_p=0.0,
310
- rowscale=None,
311
- out_dtype=None,
312
- residual_dtype=None,
313
- is_rms_norm=False,
314
- return_dropout_mask=False,
315
- ):
316
- if residual is not None:
317
- residual_dtype = residual.dtype
318
- M, N = x.shape
319
- assert x.stride(-1) == 1
320
- if residual is not None:
321
- assert residual.stride(-1) == 1
322
- assert residual.shape == (M, N)
323
- assert weight.shape == (N,)
324
- assert weight.stride(-1) == 1
325
- if bias is not None:
326
- assert bias.stride(-1) == 1
327
- assert bias.shape == (N,)
328
- if x1 is not None:
329
- assert x1.shape == x.shape
330
- assert rowscale is None
331
- assert x1.stride(-1) == 1
332
- if weight1 is not None:
333
- assert weight1.shape == (N,)
334
- assert weight1.stride(-1) == 1
335
- if bias1 is not None:
336
- assert bias1.shape == (N,)
337
- assert bias1.stride(-1) == 1
338
- if rowscale is not None:
339
- assert rowscale.is_contiguous()
340
- assert rowscale.shape == (M,)
341
- # allocate output
342
- y = torch.empty_like(x, dtype=x.dtype if out_dtype is None else out_dtype)
343
- assert y.stride(-1) == 1
344
- if weight1 is not None:
345
- y1 = torch.empty_like(y)
346
- assert y1.stride(-1) == 1
347
- else:
348
- y1 = None
349
- if (
350
- residual is not None
351
- or (residual_dtype is not None and residual_dtype != x.dtype)
352
- or dropout_p > 0.0
353
- or rowscale is not None
354
- or x1 is not None
355
- ):
356
- residual_out = torch.empty(
357
- M,
358
- N,
359
- device=x.device,
360
- dtype=residual_dtype if residual_dtype is not None else x.dtype,
361
- )
362
- assert residual_out.stride(-1) == 1
363
- else:
364
- residual_out = None
365
- mean = (
366
- torch.empty((M,), dtype=torch.float32, device=x.device)
367
- if not is_rms_norm
368
- else None
369
- )
370
- rstd = torch.empty((M,), dtype=torch.float32, device=x.device)
371
- if dropout_p > 0.0:
372
- seeds = torch.randint(
373
- 2**32, (M if x1 is None else 2 * M,), device=x.device, dtype=torch.int64
374
- )
375
- else:
376
- seeds = None
377
- if return_dropout_mask and dropout_p > 0.0:
378
- dropout_mask = torch.empty(
379
- M if x1 is None else 2 * M, N, device=x.device, dtype=torch.bool
380
- )
381
- else:
382
- dropout_mask = None
383
- # Less than 64KB per feature: enqueue fused kernel
384
- MAX_FUSED_SIZE = 65536 // x.element_size()
385
- BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
386
- if N > BLOCK_N:
387
- raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
388
- with torch.cuda.device(x.device.index):
389
- _layer_norm_fwd_1pass_kernel[(M,)](
390
- x,
391
- y,
392
- weight,
393
- bias,
394
- residual,
395
- x1,
396
- weight1,
397
- bias1,
398
- y1,
399
- residual_out,
400
- rowscale,
401
- seeds,
402
- dropout_mask,
403
- mean,
404
- rstd,
405
- x.stride(0),
406
- y.stride(0),
407
- residual.stride(0) if residual is not None else 0,
408
- residual_out.stride(0) if residual_out is not None else 0,
409
- x1.stride(0) if x1 is not None else 0,
410
- y1.stride(0) if y1 is not None else 0,
411
- M,
412
- N,
413
- eps,
414
- dropout_p,
415
- is_rms_norm,
416
- BLOCK_N,
417
- residual is not None,
418
- residual_out is not None,
419
- bias is not None,
420
- dropout_p > 0.0,
421
- dropout_mask is not None,
422
- rowscale is not None,
423
- )
424
- # residual_out is None if residual is None and residual_dtype == input_dtype and dropout_p == 0.0
425
- if dropout_mask is not None and x1 is not None:
426
- dropout_mask, dropout_mask1 = dropout_mask.tensor_split(2, dim=0)
427
- else:
428
- dropout_mask1 = None
429
- return (
430
- y,
431
- y1,
432
- mean,
433
- rstd,
434
- residual_out if residual_out is not None else x,
435
- seeds,
436
- dropout_mask,
437
- dropout_mask1,
438
- )
439
-
440
-
441
- @triton.autotune(
442
- configs=pruned_configs_autotune,
443
- key=[
444
- "N",
445
- "HAS_DRESIDUAL",
446
- "STORE_DRESIDUAL",
447
- "IS_RMS_NORM",
448
- "HAS_BIAS",
449
- "HAS_DROPOUT",
450
- ],
451
- )
452
- # @triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None})
453
- # @triton.heuristics({"HAS_DRESIDUAL": lambda args: args["DRESIDUAL"] is not None})
454
- # @triton.heuristics({"STORE_DRESIDUAL": lambda args: args["DRESIDUAL_IN"] is not None})
455
- @triton.heuristics({"HAS_ROWSCALE": lambda args: args["ROWSCALE"] is not None})
456
- @triton.heuristics({"HAS_DY1": lambda args: args["DY1"] is not None})
457
- @triton.heuristics({"HAS_DX1": lambda args: args["DX1"] is not None})
458
- @triton.heuristics({"HAS_B1": lambda args: args["DB1"] is not None})
459
- @triton.heuristics({"RECOMPUTE_OUTPUT": lambda args: args["Y"] is not None})
460
- @triton.jit
461
- def _layer_norm_bwd_kernel(
462
- X, # pointer to the input
463
- W, # pointer to the weights
464
- B, # pointer to the biases
465
- Y, # pointer to the output to be recomputed
466
- DY, # pointer to the output gradient
467
- DX, # pointer to the input gradient
468
- DW, # pointer to the partial sum of weights gradient
469
- DB, # pointer to the partial sum of biases gradient
470
- DRESIDUAL,
471
- W1,
472
- DY1,
473
- DX1,
474
- DW1,
475
- DB1,
476
- DRESIDUAL_IN,
477
- ROWSCALE,
478
- SEEDS,
479
- Mean, # pointer to the mean
480
- Rstd, # pointer to the 1/std
481
- stride_x_row, # how much to increase the pointer when moving by 1 row
482
- stride_y_row,
483
- stride_dy_row,
484
- stride_dx_row,
485
- stride_dres_row,
486
- stride_dy1_row,
487
- stride_dx1_row,
488
- stride_dres_in_row,
489
- M, # number of rows in X
490
- N, # number of columns in X
491
- eps, # epsilon to avoid division by zero
492
- dropout_p,
493
- rows_per_program,
494
- IS_RMS_NORM: tl.constexpr,
495
- BLOCK_N: tl.constexpr,
496
- HAS_DRESIDUAL: tl.constexpr,
497
- STORE_DRESIDUAL: tl.constexpr,
498
- HAS_BIAS: tl.constexpr,
499
- HAS_DROPOUT: tl.constexpr,
500
- HAS_ROWSCALE: tl.constexpr,
501
- HAS_DY1: tl.constexpr,
502
- HAS_DX1: tl.constexpr,
503
- HAS_B1: tl.constexpr,
504
- RECOMPUTE_OUTPUT: tl.constexpr,
505
- ):
506
- # Map the program id to the elements of X, DX, and DY it should compute.
507
- row_block_id = tl.program_id(0)
508
- row_start = row_block_id * rows_per_program
509
- # Do not early exit if row_start >= M, because we need to write DW and DB
510
- cols = tl.arange(0, BLOCK_N)
511
- mask = cols < N
512
- X += row_start * stride_x_row
513
- if HAS_DRESIDUAL:
514
- DRESIDUAL += row_start * stride_dres_row
515
- if STORE_DRESIDUAL:
516
- DRESIDUAL_IN += row_start * stride_dres_in_row
517
- DY += row_start * stride_dy_row
518
- DX += row_start * stride_dx_row
519
- if HAS_DY1:
520
- DY1 += row_start * stride_dy1_row
521
- if HAS_DX1:
522
- DX1 += row_start * stride_dx1_row
523
- if RECOMPUTE_OUTPUT:
524
- Y += row_start * stride_y_row
525
- w = tl.load(W + cols, mask=mask).to(tl.float32)
526
- if RECOMPUTE_OUTPUT and HAS_BIAS:
527
- b = tl.load(B + cols, mask=mask, other=0.0).to(tl.float32)
528
- if HAS_DY1:
529
- w1 = tl.load(W1 + cols, mask=mask).to(tl.float32)
530
- dw = tl.zeros((BLOCK_N,), dtype=tl.float32)
531
- if HAS_BIAS:
532
- db = tl.zeros((BLOCK_N,), dtype=tl.float32)
533
- if HAS_DY1:
534
- dw1 = tl.zeros((BLOCK_N,), dtype=tl.float32)
535
- if HAS_B1:
536
- db1 = tl.zeros((BLOCK_N,), dtype=tl.float32)
537
- row_end = min((row_block_id + 1) * rows_per_program, M)
538
- for row in range(row_start, row_end):
539
- # Load data to SRAM
540
- x = tl.load(X + cols, mask=mask, other=0).to(tl.float32)
541
- dy = tl.load(DY + cols, mask=mask, other=0).to(tl.float32)
542
- if HAS_DY1:
543
- dy1 = tl.load(DY1 + cols, mask=mask, other=0).to(tl.float32)
544
- if not IS_RMS_NORM:
545
- mean = tl.load(Mean + row)
546
- rstd = tl.load(Rstd + row)
547
- # Compute dx
548
- xhat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd
549
- xhat = tl.where(mask, xhat, 0.0)
550
- if RECOMPUTE_OUTPUT:
551
- y = xhat * w + b if HAS_BIAS else xhat * w
552
- tl.store(Y + cols, y, mask=mask)
553
- wdy = w * dy
554
- dw += dy * xhat
555
- if HAS_BIAS:
556
- db += dy
557
- if HAS_DY1:
558
- wdy += w1 * dy1
559
- dw1 += dy1 * xhat
560
- if HAS_B1:
561
- db1 += dy1
562
- if not IS_RMS_NORM:
563
- c1 = tl.sum(xhat * wdy, axis=0) / N
564
- c2 = tl.sum(wdy, axis=0) / N
565
- dx = (wdy - (xhat * c1 + c2)) * rstd
566
- else:
567
- c1 = tl.sum(xhat * wdy, axis=0) / N
568
- dx = (wdy - xhat * c1) * rstd
569
- if HAS_DRESIDUAL:
570
- dres = tl.load(DRESIDUAL + cols, mask=mask, other=0).to(tl.float32)
571
- dx += dres
572
- # Write dx
573
- if STORE_DRESIDUAL:
574
- tl.store(DRESIDUAL_IN + cols, dx, mask=mask)
575
- if HAS_DX1:
576
- if HAS_DROPOUT:
577
- keep_mask = (
578
- tl.rand(tl.load(SEEDS + M + row).to(tl.uint32), cols, n_rounds=7)
579
- > dropout_p
580
- )
581
- dx1 = tl.where(keep_mask, dx / (1.0 - dropout_p), 0.0)
582
- else:
583
- dx1 = dx
584
- tl.store(DX1 + cols, dx1, mask=mask)
585
- if HAS_DROPOUT:
586
- keep_mask = (
587
- tl.rand(tl.load(SEEDS + row).to(tl.uint32), cols, n_rounds=7)
588
- > dropout_p
589
- )
590
- dx = tl.where(keep_mask, dx / (1.0 - dropout_p), 0.0)
591
- if HAS_ROWSCALE:
592
- rowscale = tl.load(ROWSCALE + row).to(tl.float32)
593
- dx *= rowscale
594
- tl.store(DX + cols, dx, mask=mask)
595
-
596
- X += stride_x_row
597
- if HAS_DRESIDUAL:
598
- DRESIDUAL += stride_dres_row
599
- if STORE_DRESIDUAL:
600
- DRESIDUAL_IN += stride_dres_in_row
601
- if RECOMPUTE_OUTPUT:
602
- Y += stride_y_row
603
- DY += stride_dy_row
604
- DX += stride_dx_row
605
- if HAS_DY1:
606
- DY1 += stride_dy1_row
607
- if HAS_DX1:
608
- DX1 += stride_dx1_row
609
- tl.store(DW + row_block_id * N + cols, dw, mask=mask)
610
- if HAS_BIAS:
611
- tl.store(DB + row_block_id * N + cols, db, mask=mask)
612
- if HAS_DY1:
613
- tl.store(DW1 + row_block_id * N + cols, dw1, mask=mask)
614
- if HAS_B1:
615
- tl.store(DB1 + row_block_id * N + cols, db1, mask=mask)
616
-
617
-
618
- def _layer_norm_bwd(
619
- dy,
620
- x,
621
- weight,
622
- bias,
623
- eps,
624
- mean,
625
- rstd,
626
- dresidual=None,
627
- dy1=None,
628
- weight1=None,
629
- bias1=None,
630
- seeds=None,
631
- dropout_p=0.0,
632
- rowscale=None,
633
- has_residual=False,
634
- has_x1=False,
635
- is_rms_norm=False,
636
- x_dtype=None,
637
- recompute_output=False,
638
- ):
639
- M, N = x.shape
640
- assert x.stride(-1) == 1
641
- assert dy.stride(-1) == 1
642
- assert dy.shape == (M, N)
643
- if dresidual is not None:
644
- assert dresidual.stride(-1) == 1
645
- assert dresidual.shape == (M, N)
646
- assert weight.shape == (N,)
647
- assert weight.stride(-1) == 1
648
- if bias is not None:
649
- assert bias.stride(-1) == 1
650
- assert bias.shape == (N,)
651
- if dy1 is not None:
652
- assert weight1 is not None
653
- assert dy1.shape == dy.shape
654
- assert dy1.stride(-1) == 1
655
- if weight1 is not None:
656
- assert weight1.shape == (N,)
657
- assert weight1.stride(-1) == 1
658
- if bias1 is not None:
659
- assert bias1.shape == (N,)
660
- assert bias1.stride(-1) == 1
661
- if seeds is not None:
662
- assert seeds.is_contiguous()
663
- assert seeds.shape == (M if not has_x1 else M * 2,)
664
- if rowscale is not None:
665
- assert rowscale.is_contiguous()
666
- assert rowscale.shape == (M,)
667
- # allocate output
668
- dx = (
669
- torch.empty_like(x)
670
- if x_dtype is None
671
- else torch.empty(M, N, dtype=x_dtype, device=x.device)
672
- )
673
- dresidual_in = (
674
- torch.empty_like(x)
675
- if has_residual
676
- and (dx.dtype != x.dtype or dropout_p > 0.0 or rowscale is not None or has_x1)
677
- else None
678
- )
679
- dx1 = torch.empty_like(dx) if (has_x1 and dropout_p > 0.0) else None
680
- y = (
681
- torch.empty(M, N, dtype=dy.dtype, device=dy.device)
682
- if recompute_output
683
- else None
684
- )
685
- if recompute_output:
686
- assert (
687
- weight1 is None
688
- ), "recompute_output is not supported with parallel LayerNorm"
689
-
690
- # Less than 64KB per feature: enqueue fused kernel
691
- MAX_FUSED_SIZE = 65536 // x.element_size()
692
- BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
693
- if N > BLOCK_N:
694
- raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
695
- sm_count = torch.cuda.get_device_properties(x.device).multi_processor_count
696
- _dw = torch.empty((sm_count, N), dtype=torch.float32, device=weight.device)
697
- _db = (
698
- torch.empty((sm_count, N), dtype=torch.float32, device=bias.device)
699
- if bias is not None
700
- else None
701
- )
702
- _dw1 = torch.empty_like(_dw) if weight1 is not None else None
703
- _db1 = torch.empty_like(_db) if bias1 is not None else None
704
- rows_per_program = math.ceil(M / sm_count)
705
- grid = (sm_count,)
706
- with torch.cuda.device(x.device.index):
707
- _layer_norm_bwd_kernel[grid](
708
- x,
709
- weight,
710
- bias,
711
- y,
712
- dy,
713
- dx,
714
- _dw,
715
- _db,
716
- dresidual,
717
- weight1,
718
- dy1,
719
- dx1,
720
- _dw1,
721
- _db1,
722
- dresidual_in,
723
- rowscale,
724
- seeds,
725
- mean,
726
- rstd,
727
- x.stride(0),
728
- 0 if not recompute_output else y.stride(0),
729
- dy.stride(0),
730
- dx.stride(0),
731
- dresidual.stride(0) if dresidual is not None else 0,
732
- dy1.stride(0) if dy1 is not None else 0,
733
- dx1.stride(0) if dx1 is not None else 0,
734
- dresidual_in.stride(0) if dresidual_in is not None else 0,
735
- M,
736
- N,
737
- eps,
738
- dropout_p,
739
- rows_per_program,
740
- is_rms_norm,
741
- BLOCK_N,
742
- dresidual is not None,
743
- dresidual_in is not None,
744
- bias is not None,
745
- dropout_p > 0.0,
746
- )
747
- dw = _dw.sum(0).to(weight.dtype)
748
- db = _db.sum(0).to(bias.dtype) if bias is not None else None
749
- dw1 = _dw1.sum(0).to(weight1.dtype) if weight1 is not None else None
750
- db1 = _db1.sum(0).to(bias1.dtype) if bias1 is not None else None
751
- # Don't need to compute dresidual_in separately in this case
752
- if has_residual and dx.dtype == x.dtype and dropout_p == 0.0 and rowscale is None:
753
- dresidual_in = dx
754
- if has_x1 and dropout_p == 0.0:
755
- dx1 = dx
756
- return (
757
- (dx, dw, db, dresidual_in, dx1, dw1, db1)
758
- if not recompute_output
759
- else (dx, dw, db, dresidual_in, dx1, dw1, db1, y)
760
- )
761
-
762
-
763
- class LayerNormFn(torch.autograd.Function):
764
- @staticmethod
765
- def forward(
766
- ctx,
767
- x,
768
- weight,
769
- bias,
770
- residual=None,
771
- x1=None,
772
- weight1=None,
773
- bias1=None,
774
- eps=1e-6,
775
- dropout_p=0.0,
776
- rowscale=None,
777
- prenorm=False,
778
- residual_in_fp32=False,
779
- is_rms_norm=False,
780
- return_dropout_mask=False,
781
- ):
782
- x_shape_og = x.shape
783
- # reshape input data into 2D tensor
784
- x = x.reshape(-1, x.shape[-1])
785
- if x.stride(-1) != 1:
786
- x = x.contiguous()
787
- if residual is not None:
788
- assert residual.shape == x_shape_og
789
- residual = residual.reshape(-1, residual.shape[-1])
790
- if residual.stride(-1) != 1:
791
- residual = residual.contiguous()
792
- if x1 is not None:
793
- assert x1.shape == x_shape_og
794
- assert rowscale is None, "rowscale is not supported with parallel LayerNorm"
795
- x1 = x1.reshape(-1, x1.shape[-1])
796
- if x1.stride(-1) != 1:
797
- x1 = x1.contiguous()
798
- weight = weight.contiguous()
799
- if bias is not None:
800
- bias = bias.contiguous()
801
- if weight1 is not None:
802
- weight1 = weight1.contiguous()
803
- if bias1 is not None:
804
- bias1 = bias1.contiguous()
805
- if rowscale is not None:
806
- rowscale = rowscale.reshape(-1).contiguous()
807
- residual_dtype = (
808
- residual.dtype
809
- if residual is not None
810
- else (torch.float32 if residual_in_fp32 else None)
811
- )
812
- y, y1, mean, rstd, residual_out, seeds, dropout_mask, dropout_mask1 = (
813
- _layer_norm_fwd(
814
- x,
815
- weight,
816
- bias,
817
- eps,
818
- residual,
819
- x1,
820
- weight1,
821
- bias1,
822
- dropout_p=dropout_p,
823
- rowscale=rowscale,
824
- residual_dtype=residual_dtype,
825
- is_rms_norm=is_rms_norm,
826
- return_dropout_mask=return_dropout_mask,
827
- )
828
- )
829
- ctx.save_for_backward(
830
- residual_out, weight, bias, weight1, bias1, rowscale, seeds, mean, rstd
831
- )
832
- ctx.x_shape_og = x_shape_og
833
- ctx.eps = eps
834
- ctx.dropout_p = dropout_p
835
- ctx.is_rms_norm = is_rms_norm
836
- ctx.has_residual = residual is not None
837
- ctx.has_x1 = x1 is not None
838
- ctx.prenorm = prenorm
839
- ctx.x_dtype = x.dtype
840
- y = y.reshape(x_shape_og)
841
- y1 = y1.reshape(x_shape_og) if y1 is not None else None
842
- residual_out = (
843
- residual_out.reshape(x_shape_og) if residual_out is not None else None
844
- )
845
- dropout_mask = (
846
- dropout_mask.reshape(x_shape_og) if dropout_mask is not None else None
847
- )
848
- dropout_mask1 = (
849
- dropout_mask1.reshape(x_shape_og) if dropout_mask1 is not None else None
850
- )
851
- if not return_dropout_mask:
852
- if weight1 is None:
853
- return y if not prenorm else (y, residual_out)
854
- else:
855
- return (y, y1) if not prenorm else (y, y1, residual_out)
856
- else:
857
- if weight1 is None:
858
- return (
859
- (y, dropout_mask, dropout_mask1)
860
- if not prenorm
861
- else (y, residual_out, dropout_mask, dropout_mask1)
862
- )
863
- else:
864
- return (
865
- (y, y1, dropout_mask, dropout_mask1)
866
- if not prenorm
867
- else (y, y1, residual_out, dropout_mask, dropout_mask1)
868
- )
869
-
870
- @staticmethod
871
- def backward(ctx, dy, *args):
872
- x, weight, bias, weight1, bias1, rowscale, seeds, mean, rstd = ctx.saved_tensors
873
- dy = dy.reshape(-1, dy.shape[-1])
874
- if dy.stride(-1) != 1:
875
- dy = dy.contiguous()
876
- assert dy.shape == x.shape
877
- if weight1 is not None:
878
- dy1, args = args[0], args[1:]
879
- dy1 = dy1.reshape(-1, dy1.shape[-1])
880
- if dy1.stride(-1) != 1:
881
- dy1 = dy1.contiguous()
882
- assert dy1.shape == x.shape
883
- else:
884
- dy1 = None
885
- if ctx.prenorm:
886
- dresidual = args[0]
887
- dresidual = dresidual.reshape(-1, dresidual.shape[-1])
888
- if dresidual.stride(-1) != 1:
889
- dresidual = dresidual.contiguous()
890
- assert dresidual.shape == x.shape
891
- else:
892
- dresidual = None
893
- dx, dw, db, dresidual_in, dx1, dw1, db1 = _layer_norm_bwd(
894
- dy,
895
- x,
896
- weight,
897
- bias,
898
- ctx.eps,
899
- mean,
900
- rstd,
901
- dresidual,
902
- dy1,
903
- weight1,
904
- bias1,
905
- seeds,
906
- ctx.dropout_p,
907
- rowscale,
908
- ctx.has_residual,
909
- ctx.has_x1,
910
- ctx.is_rms_norm,
911
- x_dtype=ctx.x_dtype,
912
- )
913
- return (
914
- dx.reshape(ctx.x_shape_og),
915
- dw,
916
- db,
917
- dresidual_in.reshape(ctx.x_shape_og) if ctx.has_residual else None,
918
- dx1.reshape(ctx.x_shape_og) if dx1 is not None else None,
919
- dw1,
920
- db1,
921
- None,
922
- None,
923
- None,
924
- None,
925
- None,
926
- None,
927
- None,
928
- )
929
-
930
-
931
- def layer_norm_fn(
932
- x,
933
- weight,
934
- bias,
935
- residual=None,
936
- x1=None,
937
- weight1=None,
938
- bias1=None,
939
- eps=1e-6,
940
- dropout_p=0.0,
941
- rowscale=None,
942
- prenorm=False,
943
- residual_in_fp32=False,
944
- is_rms_norm=False,
945
- return_dropout_mask=False,
946
- ):
947
- return LayerNormFn.apply(
948
- x,
949
- weight,
950
- bias,
951
- residual,
952
- x1,
953
- weight1,
954
- bias1,
955
- eps,
956
- dropout_p,
957
- rowscale,
958
- prenorm,
959
- residual_in_fp32,
960
- is_rms_norm,
961
- return_dropout_mask,
962
- )
963
-
964
-
965
- def rms_norm_fn(
966
- x,
967
- weight,
968
- bias,
969
- residual=None,
970
- x1=None,
971
- weight1=None,
972
- bias1=None,
973
- eps=1e-6,
974
- dropout_p=0.0,
975
- rowscale=None,
976
- prenorm=False,
977
- residual_in_fp32=False,
978
- return_dropout_mask=False,
979
- ):
980
- return LayerNormFn.apply(
981
- x,
982
- weight,
983
- bias,
984
- residual,
985
- x1,
986
- weight1,
987
- bias1,
988
- eps,
989
- dropout_p,
990
- rowscale,
991
- prenorm,
992
- residual_in_fp32,
993
- True,
994
- return_dropout_mask,
995
- )
996
-
997
-
998
- class RMSNorm(torch.nn.Module):
999
-
1000
- def __init__(self, hidden_size, eps=1e-5, dropout_p=0.0, device=None, dtype=None):
1001
- factory_kwargs = {"device": device, "dtype": dtype}
1002
- super().__init__()
1003
- self.eps = eps
1004
- if dropout_p > 0.0:
1005
- self.drop = torch.nn.Dropout(dropout_p)
1006
- else:
1007
- self.drop = None
1008
- self.weight = torch.nn.Parameter(torch.empty(hidden_size, **factory_kwargs))
1009
- self.register_parameter("bias", None)
1010
- self.reset_parameters()
1011
-
1012
- def reset_parameters(self):
1013
- torch.nn.init.ones_(self.weight)
1014
-
1015
- def forward(self, x, residual=None, prenorm=False, residual_in_fp32=False):
1016
- return rms_norm_fn(
1017
- x,
1018
- self.weight,
1019
- self.bias,
1020
- residual=residual,
1021
- eps=self.eps,
1022
- dropout_p=self.drop.p if self.drop is not None and self.training else 0.0,
1023
- prenorm=prenorm,
1024
- residual_in_fp32=residual_in_fp32,
1025
- )
1026
-
1027
-
1028
- class LayerNormLinearFn(torch.autograd.Function):
1029
- @staticmethod
1030
- @custom_fwd
1031
- def forward(
1032
- ctx,
1033
- x,
1034
- norm_weight,
1035
- norm_bias,
1036
- linear_weight,
1037
- linear_bias,
1038
- residual=None,
1039
- eps=1e-6,
1040
- prenorm=False,
1041
- residual_in_fp32=False,
1042
- is_rms_norm=False,
1043
- ):
1044
- x_shape_og = x.shape
1045
- # reshape input data into 2D tensor
1046
- x = x.reshape(-1, x.shape[-1])
1047
- if x.stride(-1) != 1:
1048
- x = x.contiguous()
1049
- if residual is not None:
1050
- assert residual.shape == x_shape_og
1051
- residual = residual.reshape(-1, residual.shape[-1])
1052
- if residual.stride(-1) != 1:
1053
- residual = residual.contiguous()
1054
- norm_weight = norm_weight.contiguous()
1055
- if norm_bias is not None:
1056
- norm_bias = norm_bias.contiguous()
1057
- residual_dtype = (
1058
- residual.dtype
1059
- if residual is not None
1060
- else (torch.float32 if residual_in_fp32 else None)
1061
- )
1062
- y, _, mean, rstd, residual_out, *rest = _layer_norm_fwd(
1063
- x,
1064
- norm_weight,
1065
- norm_bias,
1066
- eps,
1067
- residual,
1068
- out_dtype=(
1069
- None
1070
- if not torch.is_autocast_enabled()
1071
- else torch.get_autocast_gpu_dtype()
1072
- ),
1073
- residual_dtype=residual_dtype,
1074
- is_rms_norm=is_rms_norm,
1075
- )
1076
- y = y.reshape(x_shape_og)
1077
- dtype = (
1078
- torch.get_autocast_gpu_dtype() if torch.is_autocast_enabled() else y.dtype
1079
- )
1080
- linear_weight = linear_weight.to(dtype)
1081
- linear_bias = linear_bias.to(dtype) if linear_bias is not None else None
1082
- out = F.linear(y.to(linear_weight.dtype), linear_weight, linear_bias)
1083
- # We don't store y, will be recomputed in the backward pass to save memory
1084
- ctx.save_for_backward(
1085
- residual_out, norm_weight, norm_bias, linear_weight, mean, rstd
1086
- )
1087
- ctx.x_shape_og = x_shape_og
1088
- ctx.eps = eps
1089
- ctx.is_rms_norm = is_rms_norm
1090
- ctx.has_residual = residual is not None
1091
- ctx.prenorm = prenorm
1092
- ctx.x_dtype = x.dtype
1093
- ctx.linear_bias_is_none = linear_bias is None
1094
- return out if not prenorm else (out, residual_out.reshape(x_shape_og))
1095
-
1096
- @staticmethod
1097
- @custom_bwd
1098
- def backward(ctx, dout, *args):
1099
- x, norm_weight, norm_bias, linear_weight, mean, rstd = ctx.saved_tensors
1100
- dout = dout.reshape(-1, dout.shape[-1])
1101
- dy = F.linear(dout, linear_weight.t())
1102
- dlinear_bias = None if ctx.linear_bias_is_none else dout.sum(0)
1103
- if dy.stride(-1) != 1:
1104
- dy = dy.contiguous()
1105
- assert dy.shape == x.shape
1106
- if ctx.prenorm:
1107
- dresidual = args[0]
1108
- dresidual = dresidual.reshape(-1, dresidual.shape[-1])
1109
- if dresidual.stride(-1) != 1:
1110
- dresidual = dresidual.contiguous()
1111
- assert dresidual.shape == x.shape
1112
- else:
1113
- dresidual = None
1114
- dx, dnorm_weight, dnorm_bias, dresidual_in, _, _, _, y = _layer_norm_bwd(
1115
- dy,
1116
- x,
1117
- norm_weight,
1118
- norm_bias,
1119
- ctx.eps,
1120
- mean,
1121
- rstd,
1122
- dresidual=dresidual,
1123
- has_residual=ctx.has_residual,
1124
- is_rms_norm=ctx.is_rms_norm,
1125
- x_dtype=ctx.x_dtype,
1126
- recompute_output=True,
1127
- )
1128
- dlinear_weight = torch.einsum("bo,bi->oi", dout, y)
1129
- return (
1130
- dx.reshape(ctx.x_shape_og),
1131
- dnorm_weight,
1132
- dnorm_bias,
1133
- dlinear_weight,
1134
- dlinear_bias,
1135
- dresidual_in.reshape(ctx.x_shape_og) if ctx.has_residual else None,
1136
- None,
1137
- None,
1138
- None,
1139
- None,
1140
- )
1141
-
1142
-
1143
- def layer_norm_linear_fn(
1144
- x,
1145
- norm_weight,
1146
- norm_bias,
1147
- linear_weight,
1148
- linear_bias,
1149
- residual=None,
1150
- eps=1e-6,
1151
- prenorm=False,
1152
- residual_in_fp32=False,
1153
- is_rms_norm=False,
1154
- ):
1155
- return LayerNormLinearFn.apply(
1156
- x,
1157
- norm_weight,
1158
- norm_bias,
1159
- linear_weight,
1160
- linear_bias,
1161
- residual,
1162
- eps,
1163
- prenorm,
1164
- residual_in_fp32,
1165
- is_rms_norm,
1166
- )
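
For orientation, a minimal, hedged usage sketch of the fused RMSNorm path defined in the deleted file above, checked against its pure-PyTorch reference. The import path, tensor sizes, and tolerances are assumptions, not part of the diff:

# Hedged sketch: exercising RMSNorm / rms_norm_fn from the deleted layer_norm.py
# against rms_norm_ref. Requires a CUDA device with Triton installed.
import torch
from mamba_ssm.ops.triton.layer_norm import RMSNorm, rms_norm_fn, rms_norm_ref  # assumed import path

hidden = 1024
x = torch.randn(2, 128, hidden, device="cuda", dtype=torch.float16)
residual = torch.randn(2, 128, hidden, device="cuda", dtype=torch.float32)

norm = RMSNorm(hidden, eps=1e-5, device="cuda", dtype=torch.float16)

# Pre-norm residual pattern: returns the normalized output and the updated fp32 residual stream.
out, new_residual = norm(x, residual=residual, prenorm=True, residual_in_fp32=True)
assert out.shape == x.shape and new_residual.dtype == torch.float32

# Functional form, compared against the reference implementation above.
y_fn = rms_norm_fn(x, norm.weight, None, residual=residual, eps=norm.eps)
y_ref = rms_norm_ref(x, norm.weight, None, residual=residual, eps=norm.eps, upcast=True)
torch.testing.assert_close(y_fn, y_ref, rtol=2e-2, atol=2e-2)
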
build/torch25-cxx11-cu121-x86_64-linux/mamba_ssm/ops/triton/selective_state_update.py DELETED
@@ -1,389 +0,0 @@
1
- # Copyright (c) 2024, Tri Dao, Albert Gu.
2
-
3
- """We want triton==2.1.0 or triton==2.2.0 or triton==2.3.0 for this
4
- """
5
-
6
- import math
7
- import torch
8
- import torch.nn.functional as F
9
-
10
- import triton
11
- import triton.language as tl
12
-
13
- from einops import rearrange, repeat
14
-
15
- from .softplus import softplus
16
-
17
-
18
- @triton.heuristics({"HAS_DT_BIAS": lambda args: args["dt_bias_ptr"] is not None})
19
- @triton.heuristics({"HAS_D": lambda args: args["D_ptr"] is not None})
20
- @triton.heuristics({"HAS_Z": lambda args: args["z_ptr"] is not None})
21
- @triton.heuristics(
22
- {
23
- "HAS_STATE_BATCH_INDICES": lambda args: args["state_batch_indices_ptr"]
24
- is not None
25
- }
26
- )
27
- @triton.heuristics(
28
- {"BLOCK_SIZE_DSTATE": lambda args: triton.next_power_of_2(args["dstate"])}
29
- )
30
- @triton.jit
31
- def _selective_scan_update_kernel(
32
- # Pointers to matrices
33
- state_ptr,
34
- x_ptr,
35
- dt_ptr,
36
- dt_bias_ptr,
37
- A_ptr,
38
- B_ptr,
39
- C_ptr,
40
- D_ptr,
41
- z_ptr,
42
- out_ptr,
43
- state_batch_indices_ptr,
44
- # Matrix dimensions
45
- batch,
46
- nheads,
47
- dim,
48
- dstate,
49
- nheads_ngroups_ratio,
50
- # Strides
51
- stride_state_batch,
52
- stride_state_head,
53
- stride_state_dim,
54
- stride_state_dstate,
55
- stride_x_batch,
56
- stride_x_head,
57
- stride_x_dim,
58
- stride_dt_batch,
59
- stride_dt_head,
60
- stride_dt_dim,
61
- stride_dt_bias_head,
62
- stride_dt_bias_dim,
63
- stride_A_head,
64
- stride_A_dim,
65
- stride_A_dstate,
66
- stride_B_batch,
67
- stride_B_group,
68
- stride_B_dstate,
69
- stride_C_batch,
70
- stride_C_group,
71
- stride_C_dstate,
72
- stride_D_head,
73
- stride_D_dim,
74
- stride_z_batch,
75
- stride_z_head,
76
- stride_z_dim,
77
- stride_out_batch,
78
- stride_out_head,
79
- stride_out_dim,
80
- # Meta-parameters
81
- DT_SOFTPLUS: tl.constexpr,
82
- TIE_HDIM: tl.constexpr,
83
- BLOCK_SIZE_M: tl.constexpr,
84
- HAS_DT_BIAS: tl.constexpr,
85
- HAS_D: tl.constexpr,
86
- HAS_Z: tl.constexpr,
87
- HAS_STATE_BATCH_INDICES: tl.constexpr,
88
- BLOCK_SIZE_DSTATE: tl.constexpr,
89
- ):
90
- pid_m = tl.program_id(axis=0)
91
- pid_b = tl.program_id(axis=1)
92
- pid_h = tl.program_id(axis=2)
93
-
94
- if HAS_STATE_BATCH_INDICES:
95
- state_batch_indices_ptr += pid_b
96
- state_batch_idx = tl.load(state_batch_indices_ptr)
97
- state_ptr += state_batch_idx * stride_state_batch + pid_h * stride_state_head
98
- else:
99
- state_ptr += pid_b * stride_state_batch + pid_h * stride_state_head
100
-
101
- x_ptr += pid_b * stride_x_batch + pid_h * stride_x_head
102
- dt_ptr += pid_b * stride_dt_batch + pid_h * stride_dt_head
103
- if HAS_DT_BIAS:
104
- dt_bias_ptr += pid_h * stride_dt_bias_head
105
- A_ptr += pid_h * stride_A_head
106
- B_ptr += pid_b * stride_B_batch + (pid_h // nheads_ngroups_ratio) * stride_B_group
107
- C_ptr += pid_b * stride_C_batch + (pid_h // nheads_ngroups_ratio) * stride_C_group
108
- if HAS_Z:
109
- z_ptr += pid_b * stride_z_batch + pid_h * stride_z_head
110
- out_ptr += pid_b * stride_out_batch + pid_h * stride_out_head
111
-
112
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
113
- offs_n = tl.arange(0, BLOCK_SIZE_DSTATE)
114
- state_ptrs = state_ptr + (
115
- offs_m[:, None] * stride_state_dim + offs_n[None, :] * stride_state_dstate
116
- )
117
- x_ptrs = x_ptr + offs_m * stride_x_dim
118
- dt_ptrs = dt_ptr + offs_m * stride_dt_dim
119
- if HAS_DT_BIAS:
120
- dt_bias_ptrs = dt_bias_ptr + offs_m * stride_dt_bias_dim
121
- if HAS_D:
122
- D_ptr += pid_h * stride_D_head
123
- A_ptrs = A_ptr + (
124
- offs_m[:, None] * stride_A_dim + offs_n[None, :] * stride_A_dstate
125
- )
126
- B_ptrs = B_ptr + offs_n * stride_B_dstate
127
- C_ptrs = C_ptr + offs_n * stride_C_dstate
128
- if HAS_D:
129
- D_ptrs = D_ptr + offs_m * stride_D_dim
130
- if HAS_Z:
131
- z_ptrs = z_ptr + offs_m * stride_z_dim
132
- out_ptrs = out_ptr + offs_m * stride_out_dim
133
-
134
- state = tl.load(
135
- state_ptrs, mask=(offs_m[:, None] < dim) & (offs_n[None, :] < dstate), other=0.0
136
- )
137
- x = tl.load(x_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
138
- if not TIE_HDIM:
139
- dt = tl.load(dt_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
140
- if HAS_DT_BIAS:
141
- dt += tl.load(dt_bias_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
142
- if DT_SOFTPLUS:
143
- dt = tl.where(dt <= 20.0, softplus(dt), dt)
144
- A = tl.load(
145
- A_ptrs, mask=(offs_m[:, None] < dim) & (offs_n[None, :] < dstate), other=0.0
146
- ).to(tl.float32)
147
- dA = tl.exp(A * dt[:, None])
148
- else:
149
- dt = tl.load(dt_ptr).to(tl.float32)
150
- if HAS_DT_BIAS:
151
- dt += tl.load(dt_bias_ptr).to(tl.float32)
152
- if DT_SOFTPLUS:
153
- dt = tl.where(dt <= 20.0, softplus(dt), dt)
154
- A = tl.load(A_ptr).to(tl.float32)
155
- dA = tl.exp(A * dt) # scalar, not a matrix
156
-
157
- B = tl.load(B_ptrs, mask=offs_n < dstate, other=0.0).to(tl.float32)
158
- C = tl.load(C_ptrs, mask=offs_n < dstate, other=0.0).to(tl.float32)
159
- if HAS_D:
160
- D = tl.load(D_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
161
- if HAS_Z:
162
- z = tl.load(z_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
163
-
164
- if not TIE_HDIM:
165
- dB = B[None, :] * dt[:, None]
166
- else:
167
- dB = B * dt # vector of size (dstate,)
168
- state = state * dA + dB * x[:, None]
169
- tl.store(
170
- state_ptrs, state, mask=(offs_m[:, None] < dim) & (offs_n[None, :] < dstate)
171
- )
172
- out = tl.sum(state * C[None, :], axis=1)
173
- if HAS_D:
174
- out += x * D
175
- if HAS_Z:
176
- out *= z * tl.sigmoid(z)
177
- tl.store(out_ptrs, out, mask=offs_m < dim)
178
-
179
-
180
- def selective_state_update(
181
- state,
182
- x,
183
- dt,
184
- A,
185
- B,
186
- C,
187
- D=None,
188
- z=None,
189
- dt_bias=None,
190
- dt_softplus=False,
191
- state_batch_indices=None,
192
- ):
193
- """
194
- Argument:
195
- state: (batch, dim, dstate) or (batch, nheads, dim, dstate)
196
- x: (batch, dim) or (batch, nheads, dim)
197
- dt: (batch, dim) or (batch, nheads, dim)
198
- A: (dim, dstate) or (nheads, dim, dstate)
199
- B: (batch, dstate) or (batch, ngroups, dstate)
200
- C: (batch, dstate) or (batch, ngroups, dstate)
201
- D: (dim,) or (nheads, dim)
202
- z: (batch, dim) or (batch, nheads, dim)
203
- dt_bias: (dim,) or (nheads, dim)
204
- Return:
205
- out: (batch, dim) or (batch, nheads, dim)
206
- """
207
- has_heads = state.dim() > 3
208
- if state.dim() == 3:
209
- state = state.unsqueeze(1)
210
- if x.dim() == 2:
211
- x = x.unsqueeze(1)
212
- if dt.dim() == 2:
213
- dt = dt.unsqueeze(1)
214
- if A.dim() == 2:
215
- A = A.unsqueeze(0)
216
- if B.dim() == 2:
217
- B = B.unsqueeze(1)
218
- if C.dim() == 2:
219
- C = C.unsqueeze(1)
220
- if D is not None and D.dim() == 1:
221
- D = D.unsqueeze(0)
222
- if z is not None and z.dim() == 2:
223
- z = z.unsqueeze(1)
224
- if dt_bias is not None and dt_bias.dim() == 1:
225
- dt_bias = dt_bias.unsqueeze(0)
226
- _, nheads, dim, dstate = state.shape
227
- batch = x.shape[0]
228
- if x.shape != (batch, nheads, dim):
229
- print(f"{state.shape} {x.shape} {batch} {nheads} {dim}")
230
- assert x.shape == (batch, nheads, dim)
231
- assert dt.shape == x.shape
232
- assert A.shape == (nheads, dim, dstate)
233
- ngroups = B.shape[1]
234
- assert nheads % ngroups == 0, "nheads must be divisible by ngroups"
235
- assert B.shape == (batch, ngroups, dstate)
236
- assert C.shape == B.shape
237
- if D is not None:
238
- assert D.shape == (nheads, dim)
239
- if z is not None:
240
- assert z.shape == x.shape
241
- if dt_bias is not None:
242
- assert dt_bias.shape == (nheads, dim)
243
- if state_batch_indices is not None:
244
- assert state_batch_indices.shape == (batch,)
245
- out = torch.empty_like(x)
246
- grid = lambda META: (triton.cdiv(dim, META["BLOCK_SIZE_M"]), batch, nheads)
247
- z_strides = (z.stride(0), z.stride(1), z.stride(2)) if z is not None else (0, 0, 0)
248
- # We don't want autotune since it will overwrite the state
249
- # We instead tune by hand.
250
- BLOCK_SIZE_M, num_warps = (
251
- (32, 4)
252
- if dstate <= 16
253
- else (
254
- (16, 4)
255
- if dstate <= 32
256
- else ((8, 4) if dstate <= 64 else ((4, 4) if dstate <= 128 else ((4, 8))))
257
- )
258
- )
259
- tie_hdim = (
260
- A.stride(-1) == 0
261
- and A.stride(-2) == 0
262
- and dt.stride(-1) == 0
263
- and dt_bias.stride(-1) == 0
264
- )
265
- with torch.cuda.device(x.device.index):
266
- _selective_scan_update_kernel[grid](
267
- state,
268
- x,
269
- dt,
270
- dt_bias,
271
- A,
272
- B,
273
- C,
274
- D,
275
- z,
276
- out,
277
- state_batch_indices,
278
- batch,
279
- nheads,
280
- dim,
281
- dstate,
282
- nheads // ngroups,
283
- state.stride(0),
284
- state.stride(1),
285
- state.stride(2),
286
- state.stride(3),
287
- x.stride(0),
288
- x.stride(1),
289
- x.stride(2),
290
- dt.stride(0),
291
- dt.stride(1),
292
- dt.stride(2),
293
- *(dt_bias.stride(0), dt_bias.stride(1)) if dt_bias is not None else 0,
294
- A.stride(0),
295
- A.stride(1),
296
- A.stride(2),
297
- B.stride(0),
298
- B.stride(1),
299
- B.stride(2),
300
- C.stride(0),
301
- C.stride(1),
302
- C.stride(2),
303
- *(D.stride(0), D.stride(1)) if D is not None else 0,
304
- z_strides[0],
305
- z_strides[1],
306
- z_strides[2],
307
- out.stride(0),
308
- out.stride(1),
309
- out.stride(2),
310
- dt_softplus,
311
- tie_hdim,
312
- BLOCK_SIZE_M,
313
- num_warps=num_warps,
314
- )
315
- if not has_heads:
316
- out = out.squeeze(1)
317
- return out
318
-
319
-
320
- def selective_state_update_ref(
321
- state, x, dt, A, B, C, D=None, z=None, dt_bias=None, dt_softplus=False
322
- ):
323
- """
324
- Argument:
325
- state: (batch, dim, dstate) or (batch, nheads, dim, dstate)
326
- x: (batch, dim) or (batch, nheads, dim)
327
- dt: (batch, dim) or (batch, nheads, dim)
328
- A: (dim, dstate) or (nheads, dim, dstate)
329
- B: (batch, dstate) or (batch, ngroups, dstate)
330
- C: (batch, dstate) or (batch, ngroups, dstate)
331
- D: (dim,) or (nheads, dim)
332
- z: (batch, dim) or (batch, nheads, dim)
333
- dt_bias: (dim,) or (nheads, dim)
334
- Return:
335
- out: (batch, dim) or (batch, nheads, dim)
336
- """
337
- has_heads = state.dim() > 3
338
- if state.dim() == 3:
339
- state = state.unsqueeze(1)
340
- if x.dim() == 2:
341
- x = x.unsqueeze(1)
342
- if dt.dim() == 2:
343
- dt = dt.unsqueeze(1)
344
- if A.dim() == 2:
345
- A = A.unsqueeze(0)
346
- if B.dim() == 2:
347
- B = B.unsqueeze(1)
348
- if C.dim() == 2:
349
- C = C.unsqueeze(1)
350
- if D is not None and D.dim() == 1:
351
- D = D.unsqueeze(0)
352
- if z is not None and z.dim() == 2:
353
- z = z.unsqueeze(1)
354
- if dt_bias is not None and dt_bias.dim() == 1:
355
- dt_bias = dt_bias.unsqueeze(0)
356
- batch, nheads, dim, dstate = state.shape
357
- assert x.shape == (batch, nheads, dim)
358
- assert dt.shape == x.shape
359
- assert A.shape == (nheads, dim, dstate)
360
- ngroups = B.shape[1]
361
- assert nheads % ngroups == 0, "nheads must be divisible by ngroups"
362
- assert B.shape == (batch, ngroups, dstate)
363
- assert C.shape == B.shape
364
- if D is not None:
365
- assert D.shape == (nheads, dim)
366
- if z is not None:
367
- assert z.shape == x.shape
368
- if dt_bias is not None:
369
- assert dt_bias.shape == (nheads, dim)
370
- dt = dt + dt_bias
371
- dt = F.softplus(dt) if dt_softplus else dt
372
- dA = torch.exp(
373
- rearrange(dt, "b h d -> b h d 1") * A
374
- ) # (batch, nheads, dim, dstate)
375
- B = repeat(B, "b g n -> b (g h) n", h=nheads // ngroups) # (batch, nheads, dstate)
376
- C = repeat(C, "b g n -> b (g h) n", h=nheads // ngroups) # (batch, nheads, dstate)
377
- dB = rearrange(dt, "b h d -> b h d 1") * rearrange(
378
- B, "b h n -> b h 1 n"
379
- ) # (batch, nheads, dim, dstate)
380
- state.copy_(
381
- state * dA + dB * rearrange(x, "b h d -> b h d 1")
382
- ) # (batch, nheads, dim, dstate)
383
- out = torch.einsum("bhdn,bhn->bhd", state.to(C.dtype), C)
384
- if D is not None:
385
- out += (x * D).to(out.dtype)
386
- out = (out if z is None else out * F.silu(z)).to(x.dtype)
387
- if not has_heads:
388
- out = out.squeeze(1)
389
- return out
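
As a quick check on the shapes documented in the docstrings above, a minimal hedged sketch of one decoding-step state update, comparing the Triton kernel against the pure-PyTorch reference. The per-step recurrence is state <- exp(dt*A) * state + dt * B * x, with out = C·state plus the optional D*x skip. Import path and sizes are assumptions:

# Hedged sketch: selective_state_update vs. selective_state_update_ref.
# Requires a CUDA device with Triton installed; tensor sizes are arbitrary.
import torch
from mamba_ssm.ops.triton.selective_state_update import (  # assumed import path
    selective_state_update,
    selective_state_update_ref,
)

batch, nheads, headdim, dstate, ngroups = 2, 8, 64, 16, 1
dev, dtype = "cuda", torch.float32

state = torch.randn(batch, nheads, headdim, dstate, device=dev, dtype=dtype)
x = torch.randn(batch, nheads, headdim, device=dev, dtype=dtype)
dt = torch.rand(batch, nheads, headdim, device=dev, dtype=dtype)
dt_bias = torch.rand(nheads, headdim, device=dev, dtype=dtype)
A = -torch.rand(nheads, headdim, dstate, device=dev, dtype=dtype)
B = torch.randn(batch, ngroups, dstate, device=dev, dtype=dtype)
C = torch.randn(batch, ngroups, dstate, device=dev, dtype=dtype)
D = torch.randn(nheads, headdim, device=dev, dtype=dtype)

state_ref = state.clone()
out = selective_state_update(state, x, dt, A, B, C, D=D, dt_bias=dt_bias, dt_softplus=True)
out_ref = selective_state_update_ref(state_ref, x, dt, A, B, C, D=D, dt_bias=dt_bias, dt_softplus=True)
torch.testing.assert_close(out, out_ref, rtol=1e-3, atol=1e-3)
torch.testing.assert_close(state, state_ref, rtol=1e-3, atol=1e-3)  # state is updated in place
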
build/torch25-cxx11-cu121-x86_64-linux/mamba_ssm/ops/triton/ssd_chunk_scan.py DELETED
The diff for this file is too large to render. See raw diff
 
build/torch25-cxx11-cu121-x86_64-linux/mamba_ssm/ops/triton/ssd_chunk_state.py DELETED
@@ -1,2012 +0,0 @@
1
- # Copyright (c) 2024, Tri Dao, Albert Gu.
2
-
3
- """We want triton==2.1.0 or 2.2.0 for this
4
- """
5
-
6
- import math
7
- import torch
8
- import torch.nn.functional as F
9
-
10
- import triton
11
- import triton.language as tl
12
-
13
- from einops import rearrange, repeat
14
-
15
- from .softplus import softplus
16
-
17
-
18
- def init_to_zero(names):
19
- return lambda nargs: [
20
- nargs[name].zero_() for name in names if nargs[name] is not None
21
- ]
22
-
23
-
24
- @triton.autotune(
25
- configs=[
26
- triton.Config({"BLOCK_SIZE_H": 1}),
27
- triton.Config({"BLOCK_SIZE_H": 2}),
28
- triton.Config({"BLOCK_SIZE_H": 4}),
29
- triton.Config({"BLOCK_SIZE_H": 8}),
30
- triton.Config({"BLOCK_SIZE_H": 16}),
31
- triton.Config({"BLOCK_SIZE_H": 32}),
32
- triton.Config({"BLOCK_SIZE_H": 64}),
33
- ],
34
- key=["chunk_size", "nheads"],
35
- )
36
- @triton.jit
37
- def _chunk_cumsum_fwd_kernel(
38
- # Pointers to matrices
39
- dt_ptr,
40
- A_ptr,
41
- dt_bias_ptr,
42
- dt_out_ptr,
43
- dA_cumsum_ptr,
44
- # Matrix dimension
45
- batch,
46
- seqlen,
47
- nheads,
48
- chunk_size,
49
- dt_min,
50
- dt_max,
51
- # Strides
52
- stride_dt_batch,
53
- stride_dt_seqlen,
54
- stride_dt_head,
55
- stride_A_head,
56
- stride_dt_bias_head,
57
- stride_dt_out_batch,
58
- stride_dt_out_chunk,
59
- stride_dt_out_head,
60
- stride_dt_out_csize,
61
- stride_dA_cs_batch,
62
- stride_dA_cs_chunk,
63
- stride_dA_cs_head,
64
- stride_dA_cs_csize,
65
- # Meta-parameters
66
- DT_SOFTPLUS: tl.constexpr,
67
- HAS_DT_BIAS: tl.constexpr,
68
- BLOCK_SIZE_H: tl.constexpr,
69
- BLOCK_SIZE_CHUNK: tl.constexpr,
70
- ):
71
- pid_b = tl.program_id(axis=0)
72
- pid_c = tl.program_id(axis=1)
73
- pid_h = tl.program_id(axis=2)
74
- dt_ptr += pid_b * stride_dt_batch + pid_c * chunk_size * stride_dt_seqlen
75
- dt_out_ptr += pid_b * stride_dt_out_batch + pid_c * stride_dt_out_chunk
76
- dA_cumsum_ptr += pid_b * stride_dA_cs_batch + pid_c * stride_dA_cs_chunk
77
-
78
- offs_h = pid_h * BLOCK_SIZE_H + tl.arange(0, BLOCK_SIZE_H)
79
- offs_c = tl.arange(0, BLOCK_SIZE_CHUNK)
80
- dt_ptrs = dt_ptr + (
81
- offs_h[:, None] * stride_dt_head + offs_c[None, :] * stride_dt_seqlen
82
- )
83
- A_ptrs = A_ptr + offs_h * stride_A_head
84
- dt_out_ptrs = dt_out_ptr + (
85
- offs_h[:, None] * stride_dt_out_head + offs_c[None, :] * stride_dt_out_csize
86
- )
87
- dA_cs_ptrs = dA_cumsum_ptr + (
88
- offs_h[:, None] * stride_dA_cs_head + offs_c[None, :] * stride_dA_cs_csize
89
- )
90
- chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
91
-
92
- dt = tl.load(
93
- dt_ptrs,
94
- mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit),
95
- other=0.0,
96
- ).to(tl.float32)
97
- if HAS_DT_BIAS:
98
- dt_bias = tl.load(
99
- dt_bias_ptr + offs_h * stride_dt_bias_head, mask=offs_h < nheads, other=0.0
100
- ).to(tl.float32)
101
- dt += dt_bias[:, None]
102
- if DT_SOFTPLUS:
103
- dt = tl.where(dt <= 20.0, softplus(dt), dt)
104
- # As of Triton 2.2.0, tl.clamp is not available yet
105
- # dt = tl.clamp(dt, dt_min, dt_max)
106
- dt = tl.minimum(tl.maximum(dt, dt_min), dt_max)
107
- dt = tl.where(
108
- (offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit), dt, 0.0
109
- )
110
- tl.store(
111
- dt_out_ptrs,
112
- dt,
113
- mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size),
114
- )
115
- A = tl.load(A_ptrs, mask=offs_h < nheads, other=0.0).to(tl.float32)
116
- dA = dt * A[:, None]
117
- dA_cs = tl.cumsum(dA, axis=1)
118
- tl.store(
119
- dA_cs_ptrs,
120
- dA_cs,
121
- mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size),
122
- )
123
-
124
-
125
- @triton.autotune(
126
- configs=[
127
- triton.Config(
128
- {"BLOCK_SIZE_H": 1}, pre_hook=init_to_zero(["dA_ptr", "ddt_bias_ptr"])
129
- ),
130
- triton.Config(
131
- {"BLOCK_SIZE_H": 2}, pre_hook=init_to_zero(["dA_ptr", "ddt_bias_ptr"])
132
- ),
133
- triton.Config(
134
- {"BLOCK_SIZE_H": 4}, pre_hook=init_to_zero(["dA_ptr", "ddt_bias_ptr"])
135
- ),
136
- triton.Config(
137
- {"BLOCK_SIZE_H": 8}, pre_hook=init_to_zero(["dA_ptr", "ddt_bias_ptr"])
138
- ),
139
- triton.Config(
140
- {"BLOCK_SIZE_H": 16}, pre_hook=init_to_zero(["dA_ptr", "ddt_bias_ptr"])
141
- ),
142
- triton.Config(
143
- {"BLOCK_SIZE_H": 32}, pre_hook=init_to_zero(["dA_ptr", "ddt_bias_ptr"])
144
- ),
145
- triton.Config(
146
- {"BLOCK_SIZE_H": 64}, pre_hook=init_to_zero(["dA_ptr", "ddt_bias_ptr"])
147
- ),
148
- ],
149
- key=["chunk_size", "nheads"],
150
- )
151
- @triton.jit
152
- def _chunk_cumsum_bwd_kernel(
153
- # Pointers to matrices
154
- ddA_ptr,
155
- ddt_out_ptr,
156
- dt_ptr,
157
- A_ptr,
158
- dt_bias_ptr,
159
- ddt_ptr,
160
- dA_ptr,
161
- ddt_bias_ptr,
162
- # Matrix dimensions
163
- batch,
164
- seqlen,
165
- nheads,
166
- chunk_size,
167
- dt_min,
168
- dt_max,
169
- # Strides
170
- stride_ddA_batch,
171
- stride_ddA_chunk,
172
- stride_ddA_head,
173
- stride_ddA_csize,
174
- stride_ddt_out_batch,
175
- stride_ddt_out_chunk,
176
- stride_ddt_out_head,
177
- stride_ddt_out_csize,
178
- stride_dt_batch,
179
- stride_dt_seqlen,
180
- stride_dt_head,
181
- stride_A_head,
182
- stride_dt_bias_head,
183
- stride_ddt_batch,
184
- stride_ddt_seqlen,
185
- stride_ddt_head,
186
- stride_dA_head,
187
- stride_ddt_bias_head,
188
- # Meta-parameters
189
- DT_SOFTPLUS: tl.constexpr,
190
- HAS_DT_BIAS: tl.constexpr,
191
- BLOCK_SIZE_H: tl.constexpr,
192
- BLOCK_SIZE_CHUNK: tl.constexpr,
193
- ):
194
- pid_b = tl.program_id(axis=0)
195
- pid_c = tl.program_id(axis=1)
196
- pid_h = tl.program_id(axis=2)
197
- ddt_out_ptr += pid_b * stride_ddt_out_batch + pid_c * stride_ddt_out_chunk
198
- ddA_ptr += pid_b * stride_ddA_batch + pid_c * stride_ddA_chunk
199
- dt_ptr += pid_b * stride_dt_batch + pid_c * chunk_size * stride_dt_seqlen
200
- ddt_ptr += pid_b * stride_ddt_batch + pid_c * chunk_size * stride_ddt_seqlen
201
-
202
- offs_h = pid_h * BLOCK_SIZE_H + tl.arange(0, BLOCK_SIZE_H)
203
- offs_c = tl.arange(0, BLOCK_SIZE_CHUNK)
204
- ddt_out_ptrs = ddt_out_ptr + (
205
- offs_h[:, None] * stride_ddt_out_head + offs_c[None, :] * stride_ddt_out_csize
206
- )
207
- ddA_ptrs = ddA_ptr + (
208
- offs_h[:, None] * stride_ddA_head + offs_c[None, :] * stride_ddA_csize
209
- )
210
- dt_ptrs = dt_ptr + (
211
- offs_h[:, None] * stride_dt_head + offs_c[None, :] * stride_dt_seqlen
212
- )
213
- ddt_ptrs = ddt_ptr + (
214
- offs_h[:, None] * stride_ddt_head + offs_c[None, :] * stride_ddt_seqlen
215
- )
216
- A_ptrs = A_ptr + offs_h * stride_A_head
217
- chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
218
-
219
- ddA = tl.load(
220
- ddA_ptrs,
221
- mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit),
222
- other=0.0,
223
- ).to(tl.float32)
224
- ddt_out = tl.load(
225
- ddt_out_ptrs,
226
- mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit),
227
- other=0.0,
228
- ).to(tl.float32)
229
- A = tl.load(A_ptrs, mask=offs_h < nheads, other=0.0).to(tl.float32)
230
- ddt = ddA * A[:, None] + ddt_out
231
- dt = tl.load(
232
- dt_ptrs,
233
- mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit),
234
- other=0.0,
235
- ).to(tl.float32)
236
- if HAS_DT_BIAS:
237
- dt_bias = tl.load(
238
- dt_bias_ptr + offs_h * stride_dt_bias_head, mask=offs_h < nheads, other=0.0
239
- ).to(tl.float32)
240
- dt += dt_bias[:, None]
241
- if DT_SOFTPLUS:
242
- dt_presoftplus = dt
243
- dt = tl.where(dt <= 20.0, softplus(dt), dt)
244
- clamp_mask = (dt < dt_min) | (dt > dt_max)
245
- # As of Triton 2.2.0, tl.clamp is not available yet
246
- # dt = tl.clamp(dt, dt_min, dt_max)
247
- dt = tl.minimum(tl.maximum(dt, dt_min), dt_max)
248
- dt = tl.where(
249
- (offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit), dt, 0.0
250
- )
251
- ddt = tl.where(
252
- (offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit), ddt, 0.0
253
- )
254
- ddt = tl.where(clamp_mask, 0.0, ddt)
255
- if DT_SOFTPLUS:
256
- ddt = tl.where(dt_presoftplus <= 20.0, ddt * tl.sigmoid(dt_presoftplus), ddt)
257
- tl.store(
258
- ddt_ptrs,
259
- ddt,
260
- mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit),
261
- )
262
- dA = tl.sum(ddA * dt, axis=1)
263
- tl.atomic_add(dA_ptr + offs_h * stride_dA_head, dA, mask=offs_h < nheads)
264
- if HAS_DT_BIAS:
265
- ddt_bias = tl.sum(ddt, axis=1)
266
- tl.atomic_add(
267
- ddt_bias_ptr + offs_h * stride_ddt_bias_head, ddt_bias, mask=offs_h < nheads
268
- )
269
-
270
-
271
- @triton.autotune(
272
- configs=[
273
- triton.Config(
274
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64},
275
- num_stages=3,
276
- num_warps=8,
277
- ),
278
- triton.Config(
279
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32},
280
- num_stages=4,
281
- num_warps=4,
282
- ),
283
- triton.Config(
284
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32},
285
- num_stages=4,
286
- num_warps=4,
287
- ),
288
- triton.Config(
289
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
290
- num_stages=4,
291
- num_warps=4,
292
- ),
293
- triton.Config(
294
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32},
295
- num_stages=4,
296
- num_warps=4,
297
- ),
298
- triton.Config(
299
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32},
300
- num_stages=4,
301
- num_warps=4,
302
- ),
303
- triton.Config(
304
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32},
305
- num_stages=5,
306
- num_warps=2,
307
- ),
308
- triton.Config(
309
- {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
310
- num_stages=5,
311
- num_warps=2,
312
- ),
313
- triton.Config(
314
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
315
- num_stages=4,
316
- num_warps=2,
317
- ),
318
- ],
319
- key=["hdim", "dstate", "chunk_size"],
320
- )
321
- @triton.jit
322
- def _chunk_state_fwd_kernel(
323
- # Pointers to matrices
324
- x_ptr,
325
- b_ptr,
326
- states_ptr,
327
- dt_ptr,
328
- dA_cumsum_ptr,
329
- seq_idx_ptr,
330
- # Matrix dimensions
331
- hdim,
332
- dstate,
333
- chunk_size,
334
- batch,
335
- seqlen,
336
- nheads_ngroups_ratio,
337
- # Strides
338
- stride_x_batch,
339
- stride_x_seqlen,
340
- stride_x_head,
341
- stride_x_hdim,
342
- stride_b_batch,
343
- stride_b_seqlen,
344
- stride_b_head,
345
- stride_b_dstate,
346
- stride_states_batch,
347
- stride_states_chunk,
348
- stride_states_head,
349
- stride_states_hdim,
350
- stride_states_dstate,
351
- stride_dt_batch,
352
- stride_dt_chunk,
353
- stride_dt_head,
354
- stride_dt_csize,
355
- stride_dA_cs_batch,
356
- stride_dA_cs_chunk,
357
- stride_dA_cs_head,
358
- stride_dA_cs_csize,
359
- stride_seq_idx_batch,
360
- stride_seq_idx_seqlen,
361
- # Meta-parameters
362
- HAS_SEQ_IDX: tl.constexpr,
363
- BLOCK_SIZE_M: tl.constexpr,
364
- BLOCK_SIZE_N: tl.constexpr,
365
- BLOCK_SIZE_K: tl.constexpr,
366
- ):
367
- pid_bc = tl.program_id(axis=1)
368
- pid_c = pid_bc // batch
369
- pid_b = pid_bc - pid_c * batch
370
- pid_h = tl.program_id(axis=2)
371
- num_pid_n = tl.cdiv(dstate, BLOCK_SIZE_N)
372
- pid_m = tl.program_id(axis=0) // num_pid_n
373
- pid_n = tl.program_id(axis=0) % num_pid_n
374
- b_ptr += (
375
- pid_b * stride_b_batch
376
- + pid_c * chunk_size * stride_b_seqlen
377
- + (pid_h // nheads_ngroups_ratio) * stride_b_head
378
- )
379
- x_ptr += (
380
- pid_b * stride_x_batch
381
- + pid_c * chunk_size * stride_x_seqlen
382
- + pid_h * stride_x_head
383
- )
384
- dt_ptr += pid_b * stride_dt_batch + pid_c * stride_dt_chunk + pid_h * stride_dt_head
385
- dA_cumsum_ptr += (
386
- pid_b * stride_dA_cs_batch
387
- + pid_c * stride_dA_cs_chunk
388
- + pid_h * stride_dA_cs_head
389
- )
390
- if HAS_SEQ_IDX:
391
- seq_idx_ptr += (
392
- pid_b * stride_seq_idx_batch + pid_c * chunk_size * stride_seq_idx_seqlen
393
- )
394
-
395
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
396
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
397
- offs_k = tl.arange(0, BLOCK_SIZE_K)
398
- x_ptrs = x_ptr + (
399
- offs_m[:, None] * stride_x_hdim + offs_k[None, :] * stride_x_seqlen
400
- )
401
- b_ptrs = b_ptr + (
402
- offs_n[None, :] * stride_b_dstate + offs_k[:, None] * stride_b_seqlen
403
- )
404
- dt_ptrs = dt_ptr + offs_k * stride_dt_csize
405
- dA_cs_last = tl.load(dA_cumsum_ptr + (chunk_size - 1) * stride_dA_cs_csize).to(
406
- tl.float32
407
- )
408
- dA_cumsum_ptrs = dA_cumsum_ptr + offs_k * stride_dA_cs_csize
409
- if HAS_SEQ_IDX:
410
- seq_idx_ptrs = seq_idx_ptr + offs_k * stride_seq_idx_seqlen
411
-
412
- chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
413
- if HAS_SEQ_IDX:
414
- seq_idx_last = tl.load(
415
- seq_idx_ptr + (chunk_size_limit - 1) * stride_seq_idx_seqlen
416
- )
417
-
418
- acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
419
- for k in range(0, chunk_size_limit, BLOCK_SIZE_K):
420
- x = tl.load(
421
- x_ptrs,
422
- mask=(offs_m[:, None] < hdim) & (offs_k[None, :] < chunk_size_limit - k),
423
- other=0.0,
424
- )
425
- b = tl.load(
426
- b_ptrs,
427
- mask=(offs_k[:, None] < chunk_size_limit - k) & (offs_n[None, :] < dstate),
428
- other=0.0,
429
- ).to(tl.float32)
430
- dA_cs_k = tl.load(
431
- dA_cumsum_ptrs, mask=offs_k < chunk_size_limit - k, other=0.0
432
- ).to(tl.float32)
433
- if HAS_SEQ_IDX:
434
- seq_idx_k = tl.load(
435
- seq_idx_ptrs, mask=offs_k < chunk_size_limit - k, other=-1
436
- )
437
- dt_k = tl.load(dt_ptrs, mask=offs_k < chunk_size_limit - k, other=0.0).to(
438
- tl.float32
439
- )
440
- if not HAS_SEQ_IDX:
441
- scale = tl.exp((dA_cs_last - dA_cs_k)) * dt_k
442
- else:
443
- scale = tl.where(
444
- seq_idx_k == seq_idx_last, tl.exp((dA_cs_last - dA_cs_k)) * dt_k, 0.0
445
- )
446
- b *= scale[:, None]
447
- b = b.to(x_ptr.dtype.element_ty)
448
- acc += tl.dot(x, b)
449
- x_ptrs += BLOCK_SIZE_K * stride_x_seqlen
450
- b_ptrs += BLOCK_SIZE_K * stride_b_seqlen
451
- dt_ptrs += BLOCK_SIZE_K * stride_dt_csize
452
- dA_cumsum_ptrs += BLOCK_SIZE_K * stride_dA_cs_csize
453
- if HAS_SEQ_IDX:
454
- seq_idx_ptrs += BLOCK_SIZE_K * stride_seq_idx_seqlen
455
- states = acc.to(states_ptr.dtype.element_ty)
456
-
457
- states_ptr += (
458
- pid_b * stride_states_batch
459
- + pid_c * stride_states_chunk
460
- + pid_h * stride_states_head
461
- )
462
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
463
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
464
- states_ptrs = states_ptr + (
465
- offs_m[:, None] * stride_states_hdim + offs_n[None, :] * stride_states_dstate
466
- )
467
- c_mask = (offs_m[:, None] < hdim) & (offs_n[None, :] < dstate)
468
- tl.store(states_ptrs, states, mask=c_mask)
469
-
470
-
471
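A quick illustration of the grouped-head indexing used by the kernel above (editor's sketch; the sizes are made up, not from the diff): several heads share one B group, which is what `(pid_h // nheads_ngroups_ratio) * stride_b_head` selects.

# Editor's illustration with hypothetical sizes.
nheads, ngroups = 8, 2
nheads_ngroups_ratio = nheads // ngroups
print([pid_h // nheads_ngroups_ratio for pid_h in range(nheads)])  # [0, 0, 0, 0, 1, 1, 1, 1]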
- @triton.autotune(
472
- configs=[
473
- triton.Config(
474
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64},
475
- num_stages=3,
476
- num_warps=8,
477
- pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"]),
478
- ),
479
- triton.Config(
480
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32},
481
- num_stages=4,
482
- num_warps=4,
483
- pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"]),
484
- ),
485
- triton.Config(
486
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32},
487
- num_stages=4,
488
- num_warps=4,
489
- pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"]),
490
- ),
491
- triton.Config(
492
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
493
- num_stages=4,
494
- num_warps=4,
495
- pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"]),
496
- ),
497
- triton.Config(
498
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32},
499
- num_stages=4,
500
- num_warps=4,
501
- pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"]),
502
- ),
503
- triton.Config(
504
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32},
505
- num_stages=4,
506
- num_warps=4,
507
- pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"]),
508
- ),
509
- triton.Config(
510
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32},
511
- num_stages=5,
512
- num_warps=4,
513
- pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"]),
514
- ),
515
- triton.Config(
516
- {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
517
- num_stages=5,
518
- num_warps=4,
519
- pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"]),
520
- ),
521
- triton.Config(
522
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
523
- num_stages=4,
524
- num_warps=4,
525
- pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"]),
526
- ),
527
- ],
528
- key=["chunk_size", "hdim", "dstate"],
529
- )
530
- @triton.jit
531
- def _chunk_state_bwd_dx_kernel(
532
- # Pointers to matrices
533
- x_ptr,
534
- b_ptr,
535
- dstates_ptr,
536
- dt_ptr,
537
- dA_cumsum_ptr,
538
- dx_ptr,
539
- ddt_ptr,
540
- ddA_cumsum_ptr,
541
- # Matrix dimensions
542
- chunk_size,
543
- hdim,
544
- dstate,
545
- batch,
546
- seqlen,
547
- nheads_ngroups_ratio,
548
- # Strides
549
- stride_x_batch,
550
- stride_x_seqlen,
551
- stride_x_head,
552
- stride_x_hdim,
553
- stride_b_batch,
554
- stride_b_seqlen,
555
- stride_b_head,
556
- stride_b_dstate,
557
- stride_dstates_batch,
558
- stride_dstates_chunk,
559
- stride_states_head,
560
- stride_states_hdim,
561
- stride_states_dstate,
562
- stride_dt_batch,
563
- stride_dt_chunk,
564
- stride_dt_head,
565
- stride_dt_csize,
566
- stride_dA_cs_batch,
567
- stride_dA_cs_chunk,
568
- stride_dA_cs_head,
569
- stride_dA_cs_csize,
570
- stride_dx_batch,
571
- stride_dx_seqlen,
572
- stride_dx_head,
573
- stride_dx_hdim,
574
- stride_ddt_batch,
575
- stride_ddt_chunk,
576
- stride_ddt_head,
577
- stride_ddt_csize,
578
- stride_ddA_cs_batch,
579
- stride_ddA_cs_chunk,
580
- stride_ddA_cs_head,
581
- stride_ddA_cs_csize,
582
- # Meta-parameters
583
- BLOCK_SIZE_M: tl.constexpr,
584
- BLOCK_SIZE_N: tl.constexpr,
585
- BLOCK_SIZE_K: tl.constexpr,
586
- BLOCK_SIZE_DSTATE: tl.constexpr,
587
- ):
588
- pid_bc = tl.program_id(axis=1)
589
- pid_c = pid_bc // batch
590
- pid_b = pid_bc - pid_c * batch
591
- pid_h = tl.program_id(axis=2)
592
- num_pid_n = tl.cdiv(hdim, BLOCK_SIZE_N)
593
- pid_m = tl.program_id(axis=0) // num_pid_n
594
- pid_n = tl.program_id(axis=0) % num_pid_n
595
- x_ptr += (
596
- pid_b * stride_x_batch
597
- + pid_c * chunk_size * stride_x_seqlen
598
- + pid_h * stride_x_head
599
- )
600
- b_ptr += (
601
- pid_b * stride_b_batch
602
- + pid_c * chunk_size * stride_b_seqlen
603
- + (pid_h // nheads_ngroups_ratio) * stride_b_head
604
- )
605
- dstates_ptr += (
606
- pid_b * stride_dstates_batch
607
- + pid_c * stride_dstates_chunk
608
- + pid_h * stride_states_head
609
- )
610
- dt_ptr += pid_b * stride_dt_batch + pid_c * stride_dt_chunk + pid_h * stride_dt_head
611
- ddt_ptr += (
612
- pid_b * stride_ddt_batch + pid_c * stride_ddt_chunk + pid_h * stride_ddt_head
613
- )
614
- ddA_cumsum_ptr += (
615
- pid_b * stride_ddA_cs_batch
616
- + pid_c * stride_ddA_cs_chunk
617
- + pid_h * stride_ddA_cs_head
618
- )
619
- dA_cumsum_ptr += (
620
- pid_b * stride_dA_cs_batch
621
- + pid_c * stride_dA_cs_chunk
622
- + pid_h * stride_dA_cs_head
623
- )
624
-
625
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
626
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
627
-
628
- chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
629
- # Faster to just do 1 iteration with larger BLOCK_SIZE_K, up to block size 128
630
- offs_k = tl.arange(
631
- 0, BLOCK_SIZE_DSTATE if BLOCK_SIZE_DSTATE <= 128 else BLOCK_SIZE_K
632
- )
633
- b_ptrs = b_ptr + (
634
- offs_m[:, None] * stride_b_seqlen + offs_k[None, :] * stride_b_dstate
635
- )
636
- dstates_ptrs = dstates_ptr + (
637
- offs_n[None, :] * stride_states_hdim + offs_k[:, None] * stride_states_dstate
638
- )
639
- if BLOCK_SIZE_DSTATE <= 128:
640
- b = tl.load(
641
- b_ptrs,
642
- mask=(offs_m[:, None] < chunk_size_limit) & (offs_k[None, :] < dstate),
643
- other=0.0,
644
- )
645
- dstates = tl.load(
646
- dstates_ptrs,
647
- mask=(offs_k[:, None] < dstate) & (offs_n[None, :] < hdim),
648
- other=0.0,
649
- )
650
- dstates = dstates.to(b_ptr.dtype.element_ty)
651
- acc = tl.dot(b, dstates)
652
- else:
653
- acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
654
- for k in range(0, dstate, BLOCK_SIZE_K):
655
- b = tl.load(
656
- b_ptrs,
657
- mask=(offs_m[:, None] < chunk_size_limit)
658
- & (offs_k[None, :] < dstate - k),
659
- other=0.0,
660
- )
661
- dstates = tl.load(
662
- dstates_ptrs,
663
- mask=(offs_k[:, None] < dstate - k) & (offs_n[None, :] < hdim),
664
- other=0.0,
665
- )
666
- dstates = dstates.to(b_ptr.dtype.element_ty)
667
- acc += tl.dot(b, dstates)
668
- b_ptrs += BLOCK_SIZE_K * stride_b_dstate
669
- dstates_ptrs += BLOCK_SIZE_K * stride_states_dstate
670
-
671
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
672
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
673
-
674
- dA_cs_last = tl.load(dA_cumsum_ptr + (chunk_size - 1) * stride_dA_cs_csize).to(
675
- tl.float32
676
- )
677
- dt_ptrs = dt_ptr + offs_m * stride_dt_csize
678
- dA_cumsum_ptrs = dA_cumsum_ptr + offs_m * stride_dA_cs_csize
679
- dA_cs_m = tl.load(dA_cumsum_ptrs, mask=offs_m < chunk_size, other=0.0).to(
680
- tl.float32
681
- )
682
- dt_m = tl.load(dt_ptrs, mask=offs_m < chunk_size, other=0.0).to(tl.float32)
683
- acc *= tl.exp(dA_cs_last - dA_cs_m)[:, None]
684
-
685
- x_ptrs = x_ptr + (
686
- offs_m[:, None] * stride_x_seqlen + offs_n[None, :] * stride_x_hdim
687
- )
688
- x = tl.load(
689
- x_ptrs,
690
- mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim),
691
- other=0.0,
692
- ).to(tl.float32)
693
- ddt = tl.sum(acc * x, axis=1)
694
- ddt_ptrs = ddt_ptr + offs_m * stride_ddt_csize
695
- tl.atomic_add(ddt_ptrs, ddt, mask=offs_m < chunk_size)
696
- ddA_cs = -(ddt * dt_m)
697
- ddA_cs_last = -tl.sum(ddA_cs)
698
- ddA_cumsum_ptrs = ddA_cumsum_ptr + offs_m * stride_ddA_cs_csize
699
- tl.atomic_add(ddA_cumsum_ptrs, ddA_cs, mask=offs_m < chunk_size)
700
- tl.atomic_add(ddA_cumsum_ptr + (chunk_size - 1) * stride_ddA_cs_csize, ddA_cs_last)
701
-
702
- dx = (acc * dt_m[:, None]).to(dx_ptr.dtype.element_ty)
703
- dx_ptr += (
704
- pid_b * stride_dx_batch
705
- + pid_c * chunk_size * stride_dx_seqlen
706
- + pid_h * stride_dx_head
707
- )
708
- dx_ptrs = dx_ptr + (
709
- offs_m[:, None] * stride_dx_seqlen + offs_n[None, :] * stride_dx_hdim
710
- )
711
- tl.store(
712
- dx_ptrs,
713
- dx,
714
- mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim),
715
- )
716
-
717
-
718
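The `BLOCK_SIZE_DSTATE <= 128` branch above exists because the launcher (`_chunk_state_bwd_dx`, further down) passes `BLOCK_SIZE_DSTATE = max(triton.next_power_of_2(dstate), 16)`; for typical state sizes the whole state dimension fits in a single tl.dot and the K loop is skipped. A small sketch of that value (editor's illustration):

# Editor's sketch; requires triton to be importable, as the rest of this file does.
import triton
for dstate in (16, 64, 128, 256):
    print(dstate, max(triton.next_power_of_2(dstate), 16))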
- @triton.autotune(
719
- configs=[
720
- triton.Config(
721
- {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128},
722
- num_stages=3,
723
- num_warps=4,
724
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
725
- ),
726
- triton.Config(
727
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32},
728
- num_stages=3,
729
- num_warps=4,
730
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
731
- ),
732
- triton.Config(
733
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128},
734
- num_stages=3,
735
- num_warps=4,
736
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
737
- ),
738
- triton.Config(
739
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64},
740
- num_stages=3,
741
- num_warps=4,
742
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
743
- ),
744
- triton.Config(
745
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64},
746
- num_stages=3,
747
- num_warps=4,
748
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
749
- ),
750
- triton.Config(
751
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32},
752
- num_stages=3,
753
- num_warps=4,
754
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
755
- ),
756
- triton.Config(
757
- {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64},
758
- num_stages=3,
759
- num_warps=4,
760
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
761
- ),
762
- triton.Config(
763
- {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32},
764
- num_stages=3,
765
- num_warps=4,
766
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
767
- ),
768
- ],
769
- key=["chunk_size", "dstate", "hdim"],
770
- )
771
- @triton.jit
772
- def _chunk_state_bwd_db_kernel(
773
- # Pointers to matrices
774
- x_ptr,
775
- dstates_ptr,
776
- b_ptr,
777
- dt_ptr,
778
- dA_cumsum_ptr,
779
- seq_idx_ptr,
780
- db_ptr,
781
- ddA_cumsum_ptr,
782
- # Matrix dimensions
783
- chunk_size,
784
- dstate,
785
- hdim,
786
- batch,
787
- seqlen,
788
- nheads,
789
- nheads_per_program,
790
- ngroups,
791
- # Strides
792
- stride_x_batch,
793
- stride_x_seqlen,
794
- stride_x_head,
795
- stride_x_hdim,
796
- stride_dstates_batch,
797
- stride_dstates_chunk,
798
- stride_states_head,
799
- stride_states_hdim,
800
- stride_states_dstate,
801
- stride_b_batch,
802
- stride_b_seqlen,
803
- stride_b_head,
804
- stride_b_dstate,
805
- stride_dt_batch,
806
- stride_dt_chunk,
807
- stride_dt_head,
808
- stride_dt_csize,
809
- stride_dA_cs_batch,
810
- stride_dA_cs_chunk,
811
- stride_dA_cs_head,
812
- stride_dA_cs_csize,
813
- stride_seq_idx_batch,
814
- stride_seq_idx_seqlen,
815
- stride_db_batch,
816
- stride_db_seqlen,
817
- stride_db_split,
818
- stride_db_group,
819
- stride_db_dstate,
820
- stride_ddA_cs_batch,
821
- stride_ddA_cs_chunk,
822
- stride_ddA_cs_head,
823
- stride_ddA_cs_csize,
824
- # Meta-parameters
825
- HAS_DDA_CS: tl.constexpr,
826
- HAS_SEQ_IDX: tl.constexpr,
827
- BLOCK_SIZE_M: tl.constexpr,
828
- BLOCK_SIZE_N: tl.constexpr,
829
- BLOCK_SIZE_K: tl.constexpr,
830
- ):
831
- pid_bc = tl.program_id(axis=1)
832
- pid_c = pid_bc // batch
833
- pid_b = pid_bc - pid_c * batch
834
- pid_sg = tl.program_id(axis=2)
835
- pid_s = pid_sg // ngroups
836
- pid_g = pid_sg - pid_s * ngroups
837
- num_pid_n = tl.cdiv(dstate, BLOCK_SIZE_N)
838
- pid_m = tl.program_id(axis=0) // num_pid_n
839
- pid_n = tl.program_id(axis=0) % num_pid_n
840
- x_ptr += (
841
- pid_b * stride_x_batch
842
- + pid_c * chunk_size * stride_x_seqlen
843
- + (pid_g * (nheads // ngroups) + pid_s * nheads_per_program) * stride_x_head
844
- )
845
- db_ptr += (
846
- pid_b * stride_db_batch
847
- + pid_c * chunk_size * stride_db_seqlen
848
- + pid_g * stride_db_group
849
- + pid_s * stride_db_split
850
- )
851
- dstates_ptr += (
852
- pid_b * stride_dstates_batch
853
- + pid_c * stride_dstates_chunk
854
- + (pid_g * (nheads // ngroups) + pid_s * nheads_per_program)
855
- * stride_states_head
856
- )
857
- dt_ptr += (
858
- pid_b * stride_dt_batch
859
- + pid_c * stride_dt_chunk
860
- + (pid_g * (nheads // ngroups) + pid_s * nheads_per_program) * stride_dt_head
861
- )
862
- dA_cumsum_ptr += (
863
- pid_b * stride_dA_cs_batch
864
- + pid_c * stride_dA_cs_chunk
865
- + (pid_g * (nheads // ngroups) + pid_s * nheads_per_program) * stride_dA_cs_head
866
- )
867
- if HAS_DDA_CS:
868
- b_ptr += (
869
- pid_b * stride_b_batch
870
- + pid_c * chunk_size * stride_b_seqlen
871
- + pid_g * stride_b_head
872
- )
873
- ddA_cumsum_ptr += (
874
- pid_b * stride_ddA_cs_batch
875
- + pid_c * stride_ddA_cs_chunk
876
- + (pid_g * (nheads // ngroups) + pid_s * nheads_per_program)
877
- * stride_ddA_cs_head
878
- )
879
- if HAS_SEQ_IDX:
880
- seq_idx_ptr += (
881
- pid_b * stride_seq_idx_batch + pid_c * chunk_size * stride_seq_idx_seqlen
882
- )
883
-
884
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
885
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
886
- offs_k = tl.arange(0, BLOCK_SIZE_K)
887
- x_ptrs = x_ptr + (
888
- offs_m[:, None] * stride_x_seqlen + offs_k[None, :] * stride_x_hdim
889
- )
890
- dstates_ptrs = dstates_ptr + (
891
- offs_n[None, :] * stride_states_dstate + offs_k[:, None] * stride_states_hdim
892
- )
893
- dt_ptrs = dt_ptr + offs_m * stride_dt_csize
894
- dA_cumsum_ptrs = dA_cumsum_ptr + offs_m * stride_dA_cs_csize
895
- if HAS_DDA_CS:
896
- b_ptrs = b_ptr + (
897
- offs_m[:, None] * stride_b_seqlen + offs_n[None, :] * stride_b_dstate
898
- )
899
- ddA_cumsum_ptrs = ddA_cumsum_ptr + offs_m * stride_ddA_cs_csize
900
-
901
- chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
902
- acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
903
- if HAS_DDA_CS:
904
- b = tl.load(
905
- b_ptrs,
906
- mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < dstate),
907
- other=0.0,
908
- ).to(tl.float32)
909
- if HAS_SEQ_IDX:
910
- seq_idx_m = tl.load(
911
- seq_idx_ptr + offs_m * stride_seq_idx_seqlen,
912
- mask=offs_m < chunk_size_limit,
913
- other=-1,
914
- )
915
- seq_idx_last = tl.load(
916
- seq_idx_ptr + (chunk_size_limit - 1) * stride_seq_idx_seqlen
917
- )
918
- nheads_iter = min(
919
- nheads_per_program, nheads // ngroups - pid_s * nheads_per_program
920
- )
921
- for h in range(nheads_iter):
922
- x = tl.load(
923
- x_ptrs,
924
- mask=(offs_m[:, None] < chunk_size_limit) & (offs_k[None, :] < hdim),
925
- other=0.0,
926
- )
927
- dstates = tl.load(
928
- dstates_ptrs,
929
- mask=(offs_k[:, None] < hdim) & (offs_n[None, :] < dstate),
930
- other=0.0,
931
- )
932
- dstates = dstates.to(x_ptrs.dtype.element_ty)
933
- db = tl.dot(x, dstates)
934
- dA_cs_last = tl.load(dA_cumsum_ptr + (chunk_size - 1) * stride_dA_cs_csize).to(
935
- tl.float32
936
- )
937
- dA_cs_m = tl.load(dA_cumsum_ptrs, mask=offs_m < chunk_size, other=0.0).to(
938
- tl.float32
939
- )
940
- dt_m = tl.load(dt_ptrs, mask=offs_m < chunk_size, other=0.0).to(tl.float32)
941
- if not HAS_SEQ_IDX:
942
- scale = tl.exp(dA_cs_last - dA_cs_m)
943
- else:
944
- scale = tl.where(
945
- seq_idx_m == seq_idx_last, tl.exp(dA_cs_last - dA_cs_m), 0.0
946
- )
947
- db *= (scale * dt_m)[:, None]
948
- if HAS_DDA_CS:
949
- # This is the gradient wrt (dA_cs_last - dA_cs_m), i.e. the exclusive reverse cumsum
950
- ddA_cs = tl.sum(db * b, axis=1)
951
- tl.atomic_add(
952
- ddA_cumsum_ptrs + stride_ddA_cs_csize,
953
- ddA_cs,
954
- mask=offs_m < chunk_size - 1,
955
- )
956
- acc += db
957
- x_ptrs += stride_x_head
958
- dstates_ptrs += stride_states_head
959
- dt_ptrs += stride_dt_head
960
- dA_cumsum_ptr += stride_dA_cs_head
961
- dA_cumsum_ptrs += stride_dA_cs_head
962
- if HAS_DDA_CS:
963
- ddA_cumsum_ptrs += stride_ddA_cs_head
964
-
965
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
966
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
967
- # if HAS_SEQ_IDX:
968
- # seq_idx_last = tl.load(seq_idx_ptr + (chunk_size_limit - 1) * stride_seq_idx_seqlen)
969
- # seq_idx_m = tl.load(seq_idx_ptr + offs_m * stride_seq_idx_seqlen, mask=offs_m < chunk_size_limit, other=-1)
970
- # acc = tl.where(seq_idx_m[:, None] == seq_idx_last, acc, 0.0)
971
- db_ptrs = db_ptr + (
972
- offs_m[:, None] * stride_db_seqlen + offs_n[None, :] * stride_db_dstate
973
- )
974
- tl.store(
975
- db_ptrs,
976
- acc,
977
- mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < dstate),
978
- )
979
-
980
-
981
- @triton.autotune(
982
- configs=[
983
- # triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64}, num_stages=3, num_warps=8, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
984
- # triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
985
- # triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
986
- # triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
987
- # triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
988
- # triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
989
- # triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32}, num_stages=5, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
990
- # triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=5, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
991
- # triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
992
- triton.Config(
993
- {"BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 32},
994
- num_stages=3,
995
- num_warps=4,
996
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
997
- ),
998
- triton.Config(
999
- {"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32},
1000
- num_stages=3,
1001
- num_warps=4,
1002
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
1003
- ),
1004
- triton.Config(
1005
- {"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
1006
- num_stages=3,
1007
- num_warps=4,
1008
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
1009
- ),
1010
- triton.Config(
1011
- {"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32},
1012
- num_stages=3,
1013
- num_warps=4,
1014
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
1015
- ),
1016
- triton.Config(
1017
- {"BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 32},
1018
- num_stages=4,
1019
- num_warps=8,
1020
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
1021
- ),
1022
- triton.Config(
1023
- {"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32},
1024
- num_stages=4,
1025
- num_warps=8,
1026
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
1027
- ),
1028
- triton.Config(
1029
- {"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
1030
- num_stages=4,
1031
- num_warps=8,
1032
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
1033
- ),
1034
- triton.Config(
1035
- {"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32},
1036
- num_stages=4,
1037
- num_warps=8,
1038
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
1039
- ),
1040
- ],
1041
- key=["chunk_size", "hdim", "dstate"],
1042
- )
1043
- @triton.jit
1044
- def _chunk_state_bwd_ddAcs_stable_kernel(
1045
- # Pointers to matrices
1046
- x_ptr,
1047
- b_ptr,
1048
- dstates_ptr,
1049
- dt_ptr,
1050
- dA_cumsum_ptr,
1051
- seq_idx_ptr,
1052
- ddA_cumsum_ptr,
1053
- # Matrix dimensions
1054
- chunk_size,
1055
- hdim,
1056
- dstate,
1057
- batch,
1058
- seqlen,
1059
- nheads_ngroups_ratio,
1060
- # Strides
1061
- stride_x_batch,
1062
- stride_x_seqlen,
1063
- stride_x_head,
1064
- stride_x_hdim,
1065
- stride_b_batch,
1066
- stride_b_seqlen,
1067
- stride_b_head,
1068
- stride_b_dstate,
1069
- stride_dstates_batch,
1070
- stride_dstates_chunk,
1071
- stride_states_head,
1072
- stride_states_hdim,
1073
- stride_states_dstate,
1074
- stride_dt_batch,
1075
- stride_dt_chunk,
1076
- stride_dt_head,
1077
- stride_dt_csize,
1078
- stride_dA_cs_batch,
1079
- stride_dA_cs_chunk,
1080
- stride_dA_cs_head,
1081
- stride_dA_cs_csize,
1082
- stride_seq_idx_batch,
1083
- stride_seq_idx_seqlen,
1084
- stride_ddA_cs_batch,
1085
- stride_ddA_cs_chunk,
1086
- stride_ddA_cs_head,
1087
- stride_ddA_cs_csize,
1088
- # Meta-parameters
1089
- HAS_SEQ_IDX: tl.constexpr,
1090
- BLOCK_SIZE_M: tl.constexpr,
1091
- BLOCK_SIZE_N: tl.constexpr,
1092
- BLOCK_SIZE_K: tl.constexpr,
1093
- BLOCK_SIZE_DSTATE: tl.constexpr,
1094
- ):
1095
- pid_bc = tl.program_id(axis=1)
1096
- pid_c = pid_bc // batch
1097
- pid_b = pid_bc - pid_c * batch
1098
- pid_h = tl.program_id(axis=2)
1099
- num_pid_n = tl.cdiv(hdim, BLOCK_SIZE_N)
1100
- pid_m = tl.program_id(axis=0) // num_pid_n
1101
- pid_n = tl.program_id(axis=0) % num_pid_n
1102
- x_ptr += (
1103
- pid_b * stride_x_batch
1104
- + pid_c * chunk_size * stride_x_seqlen
1105
- + pid_h * stride_x_head
1106
- )
1107
- b_ptr += (
1108
- pid_b * stride_b_batch
1109
- + pid_c * chunk_size * stride_b_seqlen
1110
- + (pid_h // nheads_ngroups_ratio) * stride_b_head
1111
- )
1112
- dstates_ptr += (
1113
- pid_b * stride_dstates_batch
1114
- + pid_c * stride_dstates_chunk
1115
- + pid_h * stride_states_head
1116
- )
1117
- dt_ptr += pid_b * stride_dt_batch + pid_c * stride_dt_chunk + pid_h * stride_dt_head
1118
- ddA_cumsum_ptr += (
1119
- pid_b * stride_ddA_cs_batch
1120
- + pid_c * stride_ddA_cs_chunk
1121
- + pid_h * stride_ddA_cs_head
1122
- )
1123
- dA_cumsum_ptr += (
1124
- pid_b * stride_dA_cs_batch
1125
- + pid_c * stride_dA_cs_chunk
1126
- + pid_h * stride_dA_cs_head
1127
- )
1128
- if HAS_SEQ_IDX:
1129
- seq_idx_ptr += (
1130
- pid_b * stride_seq_idx_batch + pid_c * chunk_size * stride_seq_idx_seqlen
1131
- )
1132
-
1133
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
1134
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
1135
-
1136
- chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
1137
- # Faster to just do 1 iteration with larger BLOCK_SIZE_K, up to block size 128
1138
- offs_k = tl.arange(
1139
- 0, BLOCK_SIZE_DSTATE if BLOCK_SIZE_DSTATE <= 128 else BLOCK_SIZE_K
1140
- )
1141
- b_ptrs = b_ptr + (
1142
- offs_m[:, None] * stride_b_seqlen + offs_k[None, :] * stride_b_dstate
1143
- )
1144
- dstates_ptrs = dstates_ptr + (
1145
- offs_n[None, :] * stride_states_hdim + offs_k[:, None] * stride_states_dstate
1146
- )
1147
- if BLOCK_SIZE_DSTATE <= 128:
1148
- b = tl.load(
1149
- b_ptrs,
1150
- mask=(offs_m[:, None] < chunk_size_limit) & (offs_k[None, :] < dstate),
1151
- other=0.0,
1152
- )
1153
- dstates = tl.load(
1154
- dstates_ptrs,
1155
- mask=(offs_k[:, None] < dstate) & (offs_n[None, :] < hdim),
1156
- other=0.0,
1157
- )
1158
- dstates = dstates.to(b_ptr.dtype.element_ty)
1159
- acc = tl.dot(b, dstates)
1160
- else:
1161
- acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
1162
- for k in range(0, dstate, BLOCK_SIZE_K):
1163
- b = tl.load(
1164
- b_ptrs,
1165
- mask=(offs_m[:, None] < chunk_size_limit)
1166
- & (offs_k[None, :] < dstate - k),
1167
- other=0.0,
1168
- )
1169
- dstates = tl.load(
1170
- dstates_ptrs,
1171
- mask=(offs_k[:, None] < dstate - k) & (offs_n[None, :] < hdim),
1172
- other=0.0,
1173
- )
1174
- dstates = dstates.to(b_ptr.dtype.element_ty)
1175
- acc += tl.dot(b, dstates)
1176
- b_ptrs += BLOCK_SIZE_K * stride_b_dstate
1177
- dstates_ptrs += BLOCK_SIZE_K * stride_states_dstate
1178
-
1179
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
1180
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
1181
-
1182
- dA_cs_m = tl.load(
1183
- dA_cumsum_ptr + offs_m * stride_dA_cs_csize, mask=offs_m < chunk_size, other=0.0
1184
- ).to(tl.float32)
1185
- dA_cs_last = tl.load(dA_cumsum_ptr + (chunk_size - 1) * stride_dA_cs_csize).to(
1186
- tl.float32
1187
- )
1188
- if not HAS_SEQ_IDX:
1189
- scale = tl.exp(dA_cs_last - dA_cs_m)
1190
- else:
1191
- seq_idx_m = tl.load(
1192
- seq_idx_ptr + offs_m * stride_seq_idx_seqlen,
1193
- mask=offs_m < chunk_size_limit,
1194
- other=-1,
1195
- )
1196
- seq_idx_last = tl.load(
1197
- seq_idx_ptr + (chunk_size_limit - 1) * stride_seq_idx_seqlen
1198
- )
1199
- scale = tl.where(seq_idx_m == seq_idx_last, tl.exp(dA_cs_last - dA_cs_m), 0.0)
1200
- acc *= scale[:, None]
1201
-
1202
- x_ptrs = x_ptr + (
1203
- offs_m[:, None] * stride_x_seqlen + offs_n[None, :] * stride_x_hdim
1204
- )
1205
- x = tl.load(
1206
- x_ptrs,
1207
- mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim),
1208
- other=0.0,
1209
- ).to(tl.float32)
1210
- dt_ptrs = dt_ptr + offs_m * stride_dt_csize
1211
- dt_m = tl.load(dt_ptrs, mask=offs_m < chunk_size, other=0.0).to(tl.float32)
1212
- ddt = tl.sum(acc * x, axis=1)
1213
- # ddA_cs = -(ddt * dt_m)
1214
- # Triton 2.2.0 errors if we have the cumsum here, so we just write it out
1215
- # then call torch.cumsum outside this kernel.
1216
- # ddA_cs = tl.cumsum(ddt * dt_m)
1217
- ddA_cs = ddt * dt_m
1218
- ddA_cumsum_ptrs = ddA_cumsum_ptr + offs_m * stride_ddA_cs_csize
1219
- # tl.atomic_add(ddA_cumsum_ptrs, ddA_cs, mask=offs_m < chunk_size)
1220
- tl.atomic_add(
1221
- ddA_cumsum_ptrs + stride_ddA_cs_csize, ddA_cs, mask=offs_m < chunk_size - 1
1222
- )
1223
-
1224
-
1225
- @triton.autotune(
1226
- configs=[
1227
- triton.Config(
1228
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64},
1229
- num_stages=3,
1230
- num_warps=8,
1231
- ),
1232
- triton.Config(
1233
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32},
1234
- num_stages=4,
1235
- num_warps=4,
1236
- ),
1237
- triton.Config(
1238
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32},
1239
- num_stages=4,
1240
- num_warps=4,
1241
- ),
1242
- triton.Config(
1243
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
1244
- num_stages=4,
1245
- num_warps=4,
1246
- ),
1247
- triton.Config(
1248
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32},
1249
- num_stages=4,
1250
- num_warps=4,
1251
- ),
1252
- triton.Config(
1253
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32},
1254
- num_stages=4,
1255
- num_warps=4,
1256
- ),
1257
- triton.Config(
1258
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32},
1259
- num_stages=5,
1260
- num_warps=2,
1261
- ),
1262
- triton.Config(
1263
- {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
1264
- num_stages=5,
1265
- num_warps=2,
1266
- ),
1267
- triton.Config(
1268
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
1269
- num_stages=4,
1270
- num_warps=2,
1271
- ),
1272
- ],
1273
- key=["hdim", "dstate", "chunk_size"],
1274
- )
1275
- @triton.jit
1276
- def _chunk_state_varlen_kernel(
1277
- # Pointers to matrices
1278
- x_ptr,
1279
- b_ptr,
1280
- dt_ptr,
1281
- dA_cumsum_ptr,
1282
- chunk_states_ptr,
1283
- cu_seqlens_ptr,
1284
- states_ptr,
1285
- # Matrix dimensions
1286
- hdim,
1287
- dstate,
1288
- chunk_size,
1289
- seqlen,
1290
- nheads_ngroups_ratio,
1291
- # Strides
1292
- stride_x_seqlen,
1293
- stride_x_head,
1294
- stride_x_hdim,
1295
- stride_b_seqlen,
1296
- stride_b_head,
1297
- stride_b_dstate,
1298
- stride_dt_chunk,
1299
- stride_dt_head,
1300
- stride_dt_csize,
1301
- stride_dA_cs_chunk,
1302
- stride_dA_cs_head,
1303
- stride_dA_cs_csize,
1304
- stride_chunk_states_chunk,
1305
- stride_chunk_states_head,
1306
- stride_chunk_states_hdim,
1307
- stride_chunk_states_dstate,
1308
- stride_states_batch,
1309
- stride_states_head,
1310
- stride_states_hdim,
1311
- stride_states_dstate,
1312
- # Meta-parameters
1313
- BLOCK_SIZE_M: tl.constexpr,
1314
- BLOCK_SIZE_N: tl.constexpr,
1315
- BLOCK_SIZE_K: tl.constexpr,
1316
- ):
1317
- pid_b = tl.program_id(axis=1)
1318
- pid_h = tl.program_id(axis=2)
1319
- num_pid_n = tl.cdiv(dstate, BLOCK_SIZE_N)
1320
- pid_m = tl.program_id(axis=0) // num_pid_n
1321
- pid_n = tl.program_id(axis=0) % num_pid_n
1322
- end_idx = tl.load(cu_seqlens_ptr + pid_b + 1)
1323
- pid_c = (end_idx - 1) // chunk_size
1324
- b_ptr += (
1325
- pid_c * chunk_size * stride_b_seqlen
1326
- + (pid_h // nheads_ngroups_ratio) * stride_b_head
1327
- )
1328
- x_ptr += pid_c * chunk_size * stride_x_seqlen + pid_h * stride_x_head
1329
- dt_ptr += pid_c * stride_dt_chunk + pid_h * stride_dt_head
1330
- dA_cumsum_ptr += pid_c * stride_dA_cs_chunk + pid_h * stride_dA_cs_head
1331
- chunk_states_ptr += (
1332
- pid_c * stride_chunk_states_chunk + pid_h * stride_chunk_states_head
1333
- )
1334
-
1335
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
1336
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
1337
- offs_k = tl.arange(0, BLOCK_SIZE_K)
1338
- x_ptrs = x_ptr + (
1339
- offs_m[:, None] * stride_x_hdim + offs_k[None, :] * stride_x_seqlen
1340
- )
1341
- b_ptrs = b_ptr + (
1342
- offs_n[None, :] * stride_b_dstate + offs_k[:, None] * stride_b_seqlen
1343
- )
1344
- dt_ptrs = dt_ptr + offs_k * stride_dt_csize
1345
- dA_cs_last = tl.load(
1346
- dA_cumsum_ptr + (end_idx - pid_c * chunk_size - 1) * stride_dA_cs_csize
1347
- ).to(tl.float32)
1348
- dA_cumsum_ptrs = dA_cumsum_ptr + offs_k * stride_dA_cs_csize
1349
-
1350
- chunk_size_limit = end_idx - pid_c * chunk_size
1351
- start_idx = tl.load(cu_seqlens_ptr + pid_b)
1352
- start_idx_cur = tl.maximum(start_idx - pid_c * chunk_size, 0)
1353
-
1354
- acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
1355
- for k in range(0, chunk_size_limit, BLOCK_SIZE_K):
1356
- x = tl.load(
1357
- x_ptrs,
1358
- mask=(offs_m[:, None] < hdim)
1359
- & (offs_k[None, :] < chunk_size_limit - k)
1360
- & (offs_k[None, :] >= start_idx_cur - k),
1361
- other=0.0,
1362
- )
1363
- b = tl.load(
1364
- b_ptrs,
1365
- mask=(offs_k[:, None] < chunk_size_limit - k)
1366
- & (offs_n[None, :] < dstate)
1367
- & (offs_k[:, None] >= start_idx_cur - k),
1368
- other=0.0,
1369
- ).to(tl.float32)
1370
- dA_cs_k = tl.load(
1371
- dA_cumsum_ptrs, mask=offs_k < chunk_size_limit - k, other=0.0
1372
- ).to(tl.float32)
1373
- dt_k = tl.load(dt_ptrs, mask=offs_k < chunk_size_limit - k, other=0.0).to(
1374
- tl.float32
1375
- )
1376
- scale = tl.where(
1377
- (offs_k >= start_idx_cur - k) & (offs_k < chunk_size_limit - k),
1378
- tl.exp((dA_cs_last - dA_cs_k)) * dt_k,
1379
- 0.0,
1380
- )
1381
- b *= scale[:, None]
1382
- b = b.to(x_ptr.dtype.element_ty)
1383
- acc += tl.dot(x, b)
1384
- x_ptrs += BLOCK_SIZE_K * stride_x_seqlen
1385
- b_ptrs += BLOCK_SIZE_K * stride_b_seqlen
1386
- dt_ptrs += BLOCK_SIZE_K * stride_dt_csize
1387
- dA_cumsum_ptrs += BLOCK_SIZE_K * stride_dA_cs_csize
1388
-
1389
- # If the sequence starts after the last chunk idx, we don't need to add the contribution from the last chunk
1390
- if start_idx < pid_c * chunk_size:
1391
- chunk_states_ptrs = chunk_states_ptr + (
1392
- offs_m[:, None] * stride_chunk_states_hdim
1393
- + offs_n[None, :] * stride_chunk_states_dstate
1394
- )
1395
- chunk_states = tl.load(
1396
- chunk_states_ptrs,
1397
- mask=(offs_m[:, None] < hdim) & (offs_n[None, :] < dstate),
1398
- other=0.0,
1399
- ).to(tl.float32)
1400
- # scale = tl.where(start_idx < pid_c * chunk_size, tl.exp(dA_cs_last), 0.0)
1401
- scale = tl.exp(dA_cs_last)
1402
- acc += chunk_states * scale
1403
-
1404
- states = acc.to(states_ptr.dtype.element_ty)
1405
-
1406
- states_ptr += pid_b * stride_states_batch + pid_h * stride_states_head
1407
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
1408
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
1409
- states_ptrs = states_ptr + (
1410
- offs_m[:, None] * stride_states_hdim + offs_n[None, :] * stride_states_dstate
1411
- )
1412
- c_mask = (offs_m[:, None] < hdim) & (offs_n[None, :] < dstate)
1413
- tl.store(states_ptrs, states, mask=c_mask)
1414
-
1415
-
1416
- def _chunk_cumsum_fwd(
1417
- dt, A, chunk_size, dt_bias=None, dt_softplus=False, dt_limit=(0.0, float("inf"))
1418
- ):
1419
- batch, seqlen, nheads = dt.shape
1420
- assert A.shape == (nheads,)
1421
- if dt_bias is not None:
1422
- assert dt_bias.shape == (nheads,)
1423
- nchunks = math.ceil(seqlen / chunk_size)
1424
- dt_out = torch.empty(
1425
- batch, nheads, nchunks, chunk_size, device=dt.device, dtype=torch.float32
1426
- )
1427
- dA_cumsum = torch.empty(
1428
- batch, nheads, nchunks, chunk_size, device=dt.device, dtype=torch.float32
1429
- )
1430
- grid_chunk_cs = lambda META: (
1431
- batch,
1432
- nchunks,
1433
- triton.cdiv(nheads, META["BLOCK_SIZE_H"]),
1434
- )
1435
- with torch.cuda.device(dt.device.index):
1436
- _chunk_cumsum_fwd_kernel[grid_chunk_cs](
1437
- dt,
1438
- A,
1439
- dt_bias,
1440
- dt_out,
1441
- dA_cumsum,
1442
- batch,
1443
- seqlen,
1444
- nheads,
1445
- chunk_size,
1446
- dt_limit[0],
1447
- dt_limit[1],
1448
- dt.stride(0),
1449
- dt.stride(1),
1450
- dt.stride(2),
1451
- A.stride(0),
1452
- dt_bias.stride(0) if dt_bias is not None else 0,
1453
- dt_out.stride(0),
1454
- dt_out.stride(2),
1455
- dt_out.stride(1),
1456
- dt_out.stride(3),
1457
- dA_cumsum.stride(0),
1458
- dA_cumsum.stride(2),
1459
- dA_cumsum.stride(1),
1460
- dA_cumsum.stride(3),
1461
- dt_softplus,
1462
- HAS_DT_BIAS=dt_bias is not None,
1463
- BLOCK_SIZE_CHUNK=triton.next_power_of_2(chunk_size),
1464
- )
1465
- return dA_cumsum, dt_out
1466
-
1467
-
1468
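For readers following the launcher above, here is a rough, non-authoritative PyTorch sketch of what `_chunk_cumsum_fwd` returns. The kernel itself lives earlier in this file; this sketch assumes it applies dt_bias, optional softplus, optional clamping to dt_limit, then a per-chunk cumulative sum of dt * A. The name `chunk_cumsum_fwd_ref` is hypothetical and ragged-final-chunk handling is simplified.

# Editor's reference sketch (not part of the diff).
import math
import torch
import torch.nn.functional as F

def chunk_cumsum_fwd_ref(dt, A, chunk_size, dt_bias=None, dt_softplus=False,
                         dt_limit=(0.0, float("inf"))):
    batch, seqlen, nheads = dt.shape
    nchunks = math.ceil(seqlen / chunk_size)
    dt = F.pad(dt.float(), (0, 0, 0, nchunks * chunk_size - seqlen))
    if dt_bias is not None:
        dt = dt + dt_bias
    if dt_softplus:
        dt = F.softplus(dt)
    if dt_limit != (0.0, float("inf")):
        dt = dt.clamp(min=dt_limit[0], max=dt_limit[1])
    # (batch, nheads, nchunks, chunk_size), matching dt_out / dA_cumsum above
    dt = dt.reshape(batch, nchunks, chunk_size, nheads).permute(0, 3, 1, 2)
    # zero out padded positions of a ragged final chunk before the cumsum
    pos = torch.arange(nchunks * chunk_size, device=dt.device).reshape(nchunks, chunk_size)
    dt = dt.masked_fill(pos[None, None] >= seqlen, 0.0)
    dA_cumsum = torch.cumsum(dt * A.float()[None, :, None, None], dim=-1)
    return dA_cumsum, dt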
- def _chunk_cumsum_bwd(
1469
- ddA,
1470
- ddt_out,
1471
- dt,
1472
- A,
1473
- dt_bias=None,
1474
- dt_softplus=False,
1475
- dt_limit=(0.0, float("inf")),
1476
- ddt=None,
1477
- ):
1478
- batch, seqlen, nheads = dt.shape
1479
- _, _, nchunks, chunk_size = ddA.shape
1480
- assert ddA.shape == (batch, nheads, nchunks, chunk_size)
1481
- assert ddt_out.shape == (batch, nheads, nchunks, chunk_size)
1482
- assert A.shape == (nheads,)
1483
- if dt_bias is not None:
1484
- assert dt_bias.shape == (nheads,)
1485
- ddt_bias = torch.empty_like(dt_bias, dtype=torch.float32)
1486
- else:
1487
- ddt_bias = None
1488
- if ddt is not None:
1489
- assert ddt.shape == dt.shape
1490
- else:
1491
- ddt = torch.empty_like(dt)
1492
- dA = torch.empty_like(A, dtype=torch.float32)
1493
- grid_chunk_cs = lambda META: (
1494
- batch,
1495
- nchunks,
1496
- triton.cdiv(nheads, META["BLOCK_SIZE_H"]),
1497
- )
1498
- with torch.cuda.device(dt.device.index):
1499
- _chunk_cumsum_bwd_kernel[grid_chunk_cs](
1500
- ddA,
1501
- ddt_out,
1502
- dt,
1503
- A,
1504
- dt_bias,
1505
- ddt,
1506
- dA,
1507
- ddt_bias,
1508
- batch,
1509
- seqlen,
1510
- nheads,
1511
- chunk_size,
1512
- dt_limit[0],
1513
- dt_limit[1],
1514
- ddA.stride(0),
1515
- ddA.stride(2),
1516
- ddA.stride(1),
1517
- ddA.stride(3),
1518
- ddt_out.stride(0),
1519
- ddt_out.stride(2),
1520
- ddt_out.stride(1),
1521
- ddt_out.stride(3),
1522
- dt.stride(0),
1523
- dt.stride(1),
1524
- dt.stride(2),
1525
- A.stride(0),
1526
- dt_bias.stride(0) if dt_bias is not None else 0,
1527
- ddt.stride(0),
1528
- ddt.stride(1),
1529
- ddt.stride(2),
1530
- dA.stride(0),
1531
- ddt_bias.stride(0) if ddt_bias is not None else 0,
1532
- dt_softplus,
1533
- HAS_DT_BIAS=dt_bias is not None,
1534
- BLOCK_SIZE_CHUNK=triton.next_power_of_2(chunk_size),
1535
- )
1536
- return ddt, dA, ddt_bias
1537
-
1538
-
1539
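One convention worth noting in these launchers (editor's note): tensors such as dt, dA_cumsum and ddt_out are stored as (batch, nheads, nchunks, chunk_size), but the kernels index them as (batch, chunk, head, csize), so the strides are handed over in permuted order.

# Editor's illustration; the shape below is hypothetical.
import torch
dt_out = torch.empty(2, 8, 4, 256)  # (batch, nheads, nchunks, chunk_size)
# order passed to the kernels: batch stride, chunk stride, head stride, csize stride
print(dt_out.stride(0), dt_out.stride(2), dt_out.stride(1), dt_out.stride(3))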
- def _chunk_state_fwd(
1540
- B, x, dt, dA_cumsum, seq_idx=None, states=None, states_in_fp32=True
1541
- ):
1542
- batch, seqlen, nheads, headdim = x.shape
1543
- _, _, nchunks, chunk_size = dt.shape
1544
- _, _, ngroups, dstate = B.shape
1545
- assert nheads % ngroups == 0
1546
- assert B.shape == (batch, seqlen, ngroups, dstate)
1547
- assert dt.shape == (batch, nheads, nchunks, chunk_size)
1548
- assert dA_cumsum.shape == dt.shape
1549
- if seq_idx is not None:
1550
- assert seq_idx.shape == (batch, seqlen)
1551
- if states is not None:
1552
- assert states.shape == (batch, nchunks, nheads, headdim, dstate)
1553
- else:
1554
- states_dtype = torch.float32 if states_in_fp32 else B.dtype
1555
- states = torch.empty(
1556
- (batch, nchunks, nheads, headdim, dstate),
1557
- device=x.device,
1558
- dtype=states_dtype,
1559
- )
1560
- grid = lambda META: (
1561
- triton.cdiv(headdim, META["BLOCK_SIZE_M"])
1562
- * triton.cdiv(dstate, META["BLOCK_SIZE_N"]),
1563
- batch * nchunks,
1564
- nheads,
1565
- )
1566
- with torch.cuda.device(x.device.index):
1567
- _chunk_state_fwd_kernel[grid](
1568
- x,
1569
- B,
1570
- states,
1571
- dt,
1572
- dA_cumsum,
1573
- seq_idx,
1574
- headdim,
1575
- dstate,
1576
- chunk_size,
1577
- batch,
1578
- seqlen,
1579
- nheads // ngroups,
1580
- x.stride(0),
1581
- x.stride(1),
1582
- x.stride(2),
1583
- x.stride(3),
1584
- B.stride(0),
1585
- B.stride(1),
1586
- B.stride(2),
1587
- B.stride(-1),
1588
- states.stride(0),
1589
- states.stride(1),
1590
- states.stride(2),
1591
- states.stride(3),
1592
- states.stride(4),
1593
- dt.stride(0),
1594
- dt.stride(2),
1595
- dt.stride(1),
1596
- dt.stride(3),
1597
- dA_cumsum.stride(0),
1598
- dA_cumsum.stride(2),
1599
- dA_cumsum.stride(1),
1600
- dA_cumsum.stride(3),
1601
- *(
1602
- (seq_idx.stride(0), seq_idx.stride(1))
1603
- if seq_idx is not None
1604
- else (0, 0)
1605
- ),
1606
- HAS_SEQ_IDX=seq_idx is not None,
1607
- )
1608
- return states
1609
-
1610
-
1611
- def _chunk_state_bwd_dx(B, x, dt, dA_cumsum, dstates, dx=None):
1612
- batch, seqlen, nheads, headdim = x.shape
1613
- _, _, nchunks, chunk_size = dt.shape
1614
- _, _, ngroups, dstate = B.shape
1615
- assert nheads % ngroups == 0
1616
- assert B.shape == (batch, seqlen, ngroups, dstate)
1617
- assert dt.shape == (batch, nheads, nchunks, chunk_size)
1618
- assert dA_cumsum.shape == dt.shape
1619
- assert dstates.shape == (batch, nchunks, nheads, headdim, dstate)
1620
- if dx is not None:
1621
- assert dx.shape == x.shape
1622
- else:
1623
- dx = torch.empty_like(x)
1624
- ddt = torch.empty(
1625
- batch, nheads, nchunks, chunk_size, device=dt.device, dtype=torch.float32
1626
- )
1627
- ddA_cumsum = torch.empty(
1628
- batch, nheads, nchunks, chunk_size, device=dA_cumsum.device, dtype=torch.float32
1629
- )
1630
- grid_dx = lambda META: (
1631
- triton.cdiv(chunk_size, META["BLOCK_SIZE_M"])
1632
- * triton.cdiv(headdim, META["BLOCK_SIZE_N"]),
1633
- batch * nchunks,
1634
- nheads,
1635
- )
1636
- with torch.cuda.device(x.device.index):
1637
- _chunk_state_bwd_dx_kernel[grid_dx](
1638
- x,
1639
- B,
1640
- dstates,
1641
- dt,
1642
- dA_cumsum,
1643
- dx,
1644
- ddt,
1645
- ddA_cumsum,
1646
- chunk_size,
1647
- headdim,
1648
- dstate,
1649
- batch,
1650
- seqlen,
1651
- nheads // ngroups,
1652
- x.stride(0),
1653
- x.stride(1),
1654
- x.stride(2),
1655
- x.stride(3),
1656
- B.stride(0),
1657
- B.stride(1),
1658
- B.stride(2),
1659
- B.stride(-1),
1660
- dstates.stride(0),
1661
- dstates.stride(1),
1662
- dstates.stride(2),
1663
- dstates.stride(3),
1664
- dstates.stride(4),
1665
- dt.stride(0),
1666
- dt.stride(2),
1667
- dt.stride(1),
1668
- dt.stride(3),
1669
- dA_cumsum.stride(0),
1670
- dA_cumsum.stride(2),
1671
- dA_cumsum.stride(1),
1672
- dA_cumsum.stride(3),
1673
- dx.stride(0),
1674
- dx.stride(1),
1675
- dx.stride(2),
1676
- dx.stride(3),
1677
- ddt.stride(0),
1678
- ddt.stride(2),
1679
- ddt.stride(1),
1680
- ddt.stride(3),
1681
- ddA_cumsum.stride(0),
1682
- ddA_cumsum.stride(2),
1683
- ddA_cumsum.stride(1),
1684
- ddA_cumsum.stride(3),
1685
- BLOCK_SIZE_DSTATE=max(triton.next_power_of_2(dstate), 16),
1686
- )
1687
- return dx, ddt.to(dt.dtype), ddA_cumsum.to(dA_cumsum.dtype)
1688
-
1689
-
1690
- def _chunk_state_bwd_db(x, dt, dA_cumsum, dstates, seq_idx=None, B=None, ngroups=1):
1691
- batch, seqlen, nheads, headdim = x.shape
1692
- _, _, nchunks, chunk_size = dt.shape
1693
- dstate = dstates.shape[-1]
1694
- assert dt.shape == (batch, nheads, nchunks, chunk_size)
1695
- assert dA_cumsum.shape == dt.shape
1696
- assert dstates.shape == (batch, nchunks, nheads, headdim, dstate)
1697
- if seq_idx is not None:
1698
- assert seq_idx.shape == (batch, seqlen)
1699
- if B is not None:
1700
- assert B.shape == (batch, seqlen, ngroups, dstate)
1701
- B_strides = (B.stride(0), B.stride(1), B.stride(2), B.stride(3))
1702
- # Use torch.empty since the Triton kernel will call init_to_zero
1703
- ddA_cumsum = torch.empty(
1704
- batch, nheads, nchunks, chunk_size, device=x.device, dtype=torch.float32
1705
- )
1706
- ddA_cumsum_strides = (
1707
- ddA_cumsum.stride(0),
1708
- ddA_cumsum.stride(2),
1709
- ddA_cumsum.stride(1),
1710
- ddA_cumsum.stride(3),
1711
- )
1712
- else:
1713
- B_strides = (0, 0, 0, 0)
1714
- ddA_cumsum = None
1715
- ddA_cumsum_strides = (0, 0, 0, 0)
1716
- nheads_ngroups_ratio = nheads // ngroups
1717
- sm_count = torch.cuda.get_device_properties(x.device).multi_processor_count
1718
- nheads_per_program = max(
1719
- min(math.ceil(batch * nchunks * nheads / sm_count), nheads_ngroups_ratio), 1
1720
- )
1721
- nsplits = triton.cdiv(nheads_ngroups_ratio, nheads_per_program)
1722
- dB = torch.empty(
1723
- batch, seqlen, nsplits, ngroups, dstate, device=x.device, dtype=torch.float32
1724
- )
1725
- grid_db = lambda META: (
1726
- triton.cdiv(chunk_size, META["BLOCK_SIZE_M"])
1727
- * triton.cdiv(dstate, META["BLOCK_SIZE_N"]),
1728
- batch * nchunks,
1729
- nsplits * ngroups,
1730
- )
1731
- with torch.cuda.device(x.device.index):
1732
- _chunk_state_bwd_db_kernel[grid_db](
1733
- x,
1734
- dstates,
1735
- B,
1736
- dt,
1737
- dA_cumsum,
1738
- seq_idx,
1739
- dB,
1740
- ddA_cumsum,
1741
- chunk_size,
1742
- dstate,
1743
- headdim,
1744
- batch,
1745
- seqlen,
1746
- nheads,
1747
- nheads_per_program,
1748
- ngroups,
1749
- x.stride(0),
1750
- x.stride(1),
1751
- x.stride(2),
1752
- x.stride(3),
1753
- dstates.stride(0),
1754
- dstates.stride(1),
1755
- dstates.stride(2),
1756
- dstates.stride(3),
1757
- dstates.stride(4),
1758
- *B_strides,
1759
- dt.stride(0),
1760
- dt.stride(2),
1761
- dt.stride(1),
1762
- dt.stride(3),
1763
- dA_cumsum.stride(0),
1764
- dA_cumsum.stride(2),
1765
- dA_cumsum.stride(1),
1766
- dA_cumsum.stride(3),
1767
- *(
1768
- (seq_idx.stride(0), seq_idx.stride(1))
1769
- if seq_idx is not None
1770
- else (0, 0)
1771
- ),
1772
- dB.stride(0),
1773
- dB.stride(1),
1774
- dB.stride(2),
1775
- dB.stride(3),
1776
- dB.stride(4),
1777
- *ddA_cumsum_strides,
1778
- HAS_DDA_CS=ddA_cumsum is not None,
1779
- HAS_SEQ_IDX=seq_idx is not None,
1780
- BLOCK_SIZE_K=max(triton.next_power_of_2(headdim), 16),
1781
- )
1782
- dB = dB.sum(2)
1783
- if ddA_cumsum is not None:
1784
- # The first element of ddA_cumsum is always zero, since that dA_cumsum does not contribute
1785
- # to the state of the chunk.
1786
- # torch.cumsum(ddA_cumsum[..., 1:], dim=-1, out=ddA_cumsum[..., 1:])
1787
- # But it's easier to just do the cumsum for all elements, the result will be the same.
1788
- torch.cumsum(ddA_cumsum, dim=-1, out=ddA_cumsum)
1789
- return dB if B is None else (dB, ddA_cumsum)
1790
-
1791
-
1792
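The `nheads_per_program` / `nsplits` arithmetic above spreads the per-group head reduction over enough programs to keep the GPU busy, and the partial results are then summed on the host via `dB.sum(2)`. A small sketch of the numbers (editor's illustration; the SM count is made up):

# Editor's sketch with hypothetical sizes.
import math
import triton
batch, nchunks, nheads, ngroups, sm_count = 2, 8, 64, 8, 108
nheads_ngroups_ratio = nheads // ngroups
nheads_per_program = max(
    min(math.ceil(batch * nchunks * nheads / sm_count), nheads_ngroups_ratio), 1
)
nsplits = triton.cdiv(nheads_ngroups_ratio, nheads_per_program)
print(nheads_per_program, nsplits)  # heads reduced per program, and partial dB copies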
- def _chunk_state_bwd_ddAcs_stable(B, x, dt, dA_cumsum, dstates, seq_idx=None):
1793
- batch, seqlen, nheads, headdim = x.shape
1794
- _, _, nchunks, chunk_size = dt.shape
1795
- _, _, ngroups, dstate = B.shape
1796
- assert nheads % ngroups == 0
1797
- assert B.shape == (batch, seqlen, ngroups, dstate)
1798
- assert dt.shape == (batch, nheads, nchunks, chunk_size)
1799
- assert dA_cumsum.shape == dt.shape
1800
- assert dstates.shape == (batch, nchunks, nheads, headdim, dstate)
1801
- if seq_idx is not None:
1802
- assert seq_idx.shape == (batch, seqlen)
1803
- # Use torch.empty since the Triton kernel will call init_to_zero
1804
- ddA_cumsum = torch.empty(
1805
- batch, nheads, nchunks, chunk_size, device=x.device, dtype=torch.float32
1806
- )
1807
- grid_ddtcs = lambda META: (
1808
- triton.cdiv(chunk_size, META["BLOCK_SIZE_M"])
1809
- * triton.cdiv(headdim, META["BLOCK_SIZE_N"]),
1810
- batch * nchunks,
1811
- nheads,
1812
- )
1813
- with torch.cuda.device(x.device.index):
1814
- _chunk_state_bwd_ddAcs_stable_kernel[grid_ddtcs](
1815
- x,
1816
- B,
1817
- dstates,
1818
- dt,
1819
- dA_cumsum,
1820
- seq_idx,
1821
- ddA_cumsum,
1822
- chunk_size,
1823
- headdim,
1824
- dstate,
1825
- batch,
1826
- seqlen,
1827
- nheads // ngroups,
1828
- x.stride(0),
1829
- x.stride(1),
1830
- x.stride(2),
1831
- x.stride(3),
1832
- B.stride(0),
1833
- B.stride(1),
1834
- B.stride(2),
1835
- B.stride(-1),
1836
- dstates.stride(0),
1837
- dstates.stride(1),
1838
- dstates.stride(2),
1839
- dstates.stride(3),
1840
- dstates.stride(4),
1841
- dt.stride(0),
1842
- dt.stride(2),
1843
- dt.stride(1),
1844
- dt.stride(3),
1845
- dA_cumsum.stride(0),
1846
- dA_cumsum.stride(2),
1847
- dA_cumsum.stride(1),
1848
- dA_cumsum.stride(3),
1849
- *(
1850
- (seq_idx.stride(0), seq_idx.stride(1))
1851
- if seq_idx is not None
1852
- else (0, 0)
1853
- ),
1854
- ddA_cumsum.stride(0),
1855
- ddA_cumsum.stride(2),
1856
- ddA_cumsum.stride(1),
1857
- ddA_cumsum.stride(3),
1858
- HAS_SEQ_IDX=seq_idx is not None,
1859
- BLOCK_SIZE_M=max(triton.next_power_of_2(chunk_size), 16),
1860
- BLOCK_SIZE_DSTATE=max(triton.next_power_of_2(dstate), 16),
1861
- )
1862
- torch.cumsum(ddA_cumsum[..., 1:], dim=-1, out=ddA_cumsum[..., 1:])
1863
- return ddA_cumsum
1864
-
1865
-
1866
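As the comment inside `_chunk_state_bwd_ddAcs_stable_kernel` notes, the kernel only atomically accumulates the raw per-position terms, shifted by one position so that index 0 stays zero (the pre_hook zeroes the buffer), and the cumulative sum is finished here with `torch.cumsum`. A toy check of that equivalence (editor's sketch):

# Editor's toy check of the shift-then-cumsum pattern used above.
import torch
terms = torch.randn(8)          # per-position ddA contributions from the kernel
ddA = torch.zeros(9)            # index 0 stays zero (init_to_zero pre_hook)
ddA[1:] = terms                 # atomic_add at ddA_cumsum_ptrs + stride_ddA_cs_csize
torch.cumsum(ddA[..., 1:], dim=-1, out=ddA[..., 1:])
assert torch.allclose(ddA, torch.cat([torch.zeros(1), torch.cumsum(terms, dim=0)]))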
- def chunk_state_varlen(B, x, dt, dA_cumsum, cu_seqlens, chunk_states):
1867
- total_seqlen, nheads, headdim = x.shape
1868
- _, nchunks, chunk_size = dt.shape
1869
- _, ngroups, dstate = B.shape
1870
- batch = cu_seqlens.shape[0] - 1
1871
- cu_seqlens = cu_seqlens.contiguous()
1872
- assert nheads % ngroups == 0
1873
- assert B.shape == (total_seqlen, ngroups, dstate)
1874
- assert dt.shape == (nheads, nchunks, chunk_size)
1875
- assert dA_cumsum.shape == dt.shape
1876
- assert chunk_states.shape == (nchunks, nheads, headdim, dstate)
1877
- states = torch.empty(
1878
- batch,
1879
- nheads,
1880
- headdim,
1881
- dstate,
1882
- dtype=chunk_states.dtype,
1883
- device=chunk_states.device,
1884
- )
1885
- grid = lambda META: (
1886
- triton.cdiv(headdim, META["BLOCK_SIZE_M"])
1887
- * triton.cdiv(dstate, META["BLOCK_SIZE_N"]),
1888
- batch,
1889
- nheads,
1890
- )
1891
- with torch.cuda.device(x.device.index):
1892
- _chunk_state_varlen_kernel[grid](
1893
- x,
1894
- B,
1895
- dt,
1896
- dA_cumsum,
1897
- chunk_states,
1898
- cu_seqlens,
1899
- states,
1900
- headdim,
1901
- dstate,
1902
- chunk_size,
1903
- total_seqlen,
1904
- nheads // ngroups,
1905
- x.stride(0),
1906
- x.stride(1),
1907
- x.stride(2),
1908
- B.stride(0),
1909
- B.stride(1),
1910
- B.stride(2),
1911
- dt.stride(1),
1912
- dt.stride(0),
1913
- dt.stride(2),
1914
- dA_cumsum.stride(1),
1915
- dA_cumsum.stride(0),
1916
- dA_cumsum.stride(2),
1917
- chunk_states.stride(0),
1918
- chunk_states.stride(1),
1919
- chunk_states.stride(2),
1920
- chunk_states.stride(3),
1921
- states.stride(0),
1922
- states.stride(1),
1923
- states.stride(2),
1924
- states.stride(3),
1925
- )
1926
- return states
1927
-
1928
-
1929
- class ChunkStateFn(torch.autograd.Function):
1930
-
1931
- @staticmethod
1932
- def forward(ctx, B, x, dt, dA_cumsum, states_in_fp32=True):
1933
- batch, seqlen, nheads, headdim = x.shape
1934
- _, _, nchunks, chunk_size = dt.shape
1935
- assert seqlen <= nchunks * chunk_size
1936
- _, _, ngroups, dstate = B.shape
1937
- assert B.shape == (batch, seqlen, ngroups, dstate)
1938
- assert dt.shape == (batch, nheads, nchunks, chunk_size)
1939
- assert dA_cumsum.shape == (batch, nheads, nchunks, chunk_size)
1940
- if B.stride(-1) != 1:
1941
- B = B.contiguous()
1942
- if (
1943
- x.stride(-1) != 1 and x.stride(1) != 1
1944
- ): # Either M or K dimension should be contiguous
1945
- x = x.contiguous()
1946
- states = _chunk_state_fwd(B, x, dt, dA_cumsum, states_in_fp32=states_in_fp32)
1947
- ctx.save_for_backward(B, x, dt, dA_cumsum)
1948
- return states
1949
-
1950
- @staticmethod
1951
- def backward(ctx, dstates):
1952
- B, x, dt, dA_cumsum = ctx.saved_tensors
1953
- batch, seqlen, nheads, headdim = x.shape
1954
- _, _, nchunks, chunk_size = dt.shape
1955
- _, _, ngroups, dstate = B.shape
1956
- assert dstates.shape == (batch, nchunks, nheads, headdim, dstate)
1957
- if dstates.stride(-1) != 1:
1958
- dstates = dstates.contiguous()
1959
- dx, ddt, ddA_cumsum = _chunk_state_bwd_dx(B, x, dt, dA_cumsum, dstates)
1960
- dB = _chunk_state_bwd_db(x, dt, dA_cumsum, dstates, ngroups=ngroups)
1961
- dB = dB.to(B.dtype)
1962
- return dB, dx, ddt, ddA_cumsum, None
1963
-
1964
-
1965
- def chunk_state(B, x, dt, dA_cumsum, states_in_fp32=True):
1966
- """
1967
- Argument:
1968
-         B: (batch, seqlen, ngroups, dstate)
1969
- x: (batch, seqlen, nheads, headdim)
1970
- dt: (batch, nheads, nchunks, chunk_size)
1971
- dA_cumsum: (batch, nheads, nchunks, chunk_size)
1972
- Return:
1973
- states: (batch, nchunks, nheads, headdim, dstate)
1974
- """
1975
- return ChunkStateFn.apply(B, x, dt, dA_cumsum, states_in_fp32)
1976
-
1977
-
1978
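A minimal usage sketch of the autograd wrapper above, with hypothetical shapes matching the docstring and the shape asserts (editor's addition; assumes a CUDA device):

# Editor's usage sketch; sizes are illustrative only.
import torch
batch, seqlen, nheads, headdim, ngroups, dstate, chunk_size = 2, 512, 8, 64, 1, 16, 256
nchunks = seqlen // chunk_size
x = torch.randn(batch, seqlen, nheads, headdim, device="cuda", dtype=torch.bfloat16)
B = torch.randn(batch, seqlen, ngroups, dstate, device="cuda", dtype=torch.bfloat16)
dt = torch.rand(batch, nheads, nchunks, chunk_size, device="cuda", dtype=torch.float32)
dA_cumsum = torch.cumsum(-dt, dim=-1)
states = chunk_state(B, x, dt, dA_cumsum)
print(states.shape)  # (batch, nchunks, nheads, headdim, dstate)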
- def chunk_state_ref(B, x, dt, dA_cumsum):
1979
- """
1980
- Argument:
1981
-         B: (batch, seqlen, ngroups, dstate)
1982
- x: (batch, seqlen, nheads, headdim)
1983
- dt: (batch, nheads, nchunks, chunk_size)
1984
- dA_cumsum: (batch, nheads, nchunks, chunk_size)
1985
- Return:
1986
- states: (batch, nchunks, nheads, headdim, dstate)
1987
- """
1988
- # Check constraints.
1989
- batch, seqlen, nheads, headdim = x.shape
1990
- dstate = B.shape[-1]
1991
- _, _, nchunks, chunk_size = dt.shape
1992
- assert seqlen <= nchunks * chunk_size
1993
- assert x.shape == (batch, seqlen, nheads, headdim)
1994
- assert dt.shape == (batch, nheads, nchunks, chunk_size)
1995
- ngroups = B.shape[2]
1996
- assert nheads % ngroups == 0
1997
- assert B.shape == (batch, seqlen, ngroups, dstate)
1998
- B = repeat(B, "b l g d -> b l (g h) d", h=nheads // ngroups)
1999
- assert dA_cumsum.shape == (batch, nheads, nchunks, chunk_size)
2000
- if seqlen < nchunks * chunk_size:
2001
- x = F.pad(x, (0, 0, 0, 0, 0, nchunks * chunk_size - seqlen))
2002
- B = F.pad(B, (0, 0, 0, 0, 0, nchunks * chunk_size - seqlen))
2003
- x = rearrange(x, "b (c l) h p -> b c l h p", l=chunk_size)
2004
- B = rearrange(B, "b (c l) ... -> b c l ...", l=chunk_size)
2005
- decay_states = torch.exp((dA_cumsum[:, :, :, -1:] - dA_cumsum))
2006
- return torch.einsum(
2007
- "bclhn,bhcl,bhcl,bclhp->bchpn",
2008
- B.to(x.dtype),
2009
- decay_states.to(x.dtype),
2010
- dt.to(x.dtype),
2011
- x,
2012
- )
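The reference implementation above doubles as a correctness check for the Triton path; a small self-contained comparison (editor's sketch; tolerances and sizes are illustrative, CUDA assumed):

# Editor's cross-check sketch; all sizes are made up.
import torch
b, l, h, p, g, n, cs = 1, 128, 4, 32, 1, 16, 64
x = torch.randn(b, l, h, p, device="cuda")
B = torch.randn(b, l, g, n, device="cuda")
dt = torch.rand(b, h, l // cs, cs, device="cuda")
dA_cumsum = torch.cumsum(-dt, dim=-1)
out = chunk_state(B, x, dt, dA_cumsum)
ref = chunk_state_ref(B, x, dt, dA_cumsum)
print(torch.allclose(out, ref, atol=1e-3, rtol=1e-3))
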
build/torch25-cxx11-cu121-x86_64-linux/mamba_ssm/ops/triton/ssd_combined.py DELETED
@@ -1,1884 +0,0 @@
1
- # Copyright (c) 2024, Tri Dao, Albert Gu.
2
-
3
- """We want triton==2.1.0 or 2.2.0 for this
4
- """
5
-
6
- from typing import Optional
7
-
8
- import math
9
- from packaging import version
10
-
11
- import torch
12
- import torch.nn.functional as F
13
- from torch import Tensor
14
- from ...utils.torch import custom_bwd, custom_fwd
15
-
16
- import triton
17
- import triton.language as tl
18
-
19
- from einops import rearrange, repeat
20
-
21
- try:
22
- from causal_conv1d import causal_conv1d_fn
23
- import causal_conv1d_cuda
24
- except ImportError:
25
- causal_conv1d_fn, causal_conv1d_cuda = None, None
26
-
27
- from .ssd_bmm import _bmm_chunk_fwd, _bmm_chunk_bwd
28
- from .ssd_chunk_state import _chunk_cumsum_fwd, _chunk_cumsum_bwd
29
- from .ssd_chunk_state import _chunk_state_fwd, _chunk_state_bwd_db
30
- from .ssd_chunk_state import _chunk_state_bwd_ddAcs_stable
31
- from .ssd_chunk_state import chunk_state, chunk_state_ref
32
- from .ssd_chunk_state import chunk_state_varlen
33
- from .ssd_state_passing import _state_passing_fwd, _state_passing_bwd
34
- from .ssd_state_passing import state_passing, state_passing_ref
35
- from .ssd_chunk_scan import _chunk_scan_fwd, _chunk_scan_bwd_dz, _chunk_scan_bwd_dstates
36
- from .ssd_chunk_scan import _chunk_scan_bwd_dC, _chunk_scan_bwd_dcb
37
- from .ssd_chunk_scan import _chunk_scan_bwd_ddAcs_stable
38
- from .ssd_chunk_scan import chunk_scan, chunk_scan_ref
39
- from .ssd_chunk_scan import _chunk_scan_bwd_ddAcs_prev
40
- from .layernorm_gated import rmsnorm_fn, _layer_norm_fwd, _layer_norm_bwd
41
- from .k_activations import _swiglu_fwd, _swiglu_bwd
42
-
43
- TRITON_22 = version.parse(triton.__version__) >= version.parse("2.2.0")
44
-
45
-
46
- def init_to_zero(names):
47
- return lambda nargs: [
48
- nargs[name].zero_() for name in names if nargs[name] is not None
49
- ]
50
-
51
-
52
- @triton.autotune(
53
- configs=[
54
- triton.Config(
55
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64},
56
- num_stages=3,
57
- num_warps=8,
58
- pre_hook=init_to_zero(["ddt_ptr"]),
59
- ),
60
- triton.Config(
61
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32},
62
- num_stages=4,
63
- num_warps=4,
64
- pre_hook=init_to_zero(["ddt_ptr"]),
65
- ),
66
- triton.Config(
67
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32},
68
- num_stages=4,
69
- num_warps=4,
70
- pre_hook=init_to_zero(["ddt_ptr"]),
71
- ),
72
- triton.Config(
73
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
74
- num_stages=4,
75
- num_warps=4,
76
- pre_hook=init_to_zero(["ddt_ptr"]),
77
- ),
78
- triton.Config(
79
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32},
80
- num_stages=4,
81
- num_warps=4,
82
- pre_hook=init_to_zero(["ddt_ptr"]),
83
- ),
84
- triton.Config(
85
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32},
86
- num_stages=4,
87
- num_warps=4,
88
- pre_hook=init_to_zero(["ddt_ptr"]),
89
- ),
90
- triton.Config(
91
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32},
92
- num_stages=5,
93
- num_warps=4,
94
- pre_hook=init_to_zero(["ddt_ptr"]),
95
- ),
96
- triton.Config(
97
- {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
98
- num_stages=5,
99
- num_warps=4,
100
- pre_hook=init_to_zero(["ddt_ptr"]),
101
- ),
102
- triton.Config(
103
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
104
- num_stages=4,
105
- num_warps=4,
106
- pre_hook=init_to_zero(["ddt_ptr"]),
107
- ),
108
- ],
109
- key=["chunk_size", "hdim", "dstate"],
110
- )
111
- @triton.jit
112
- def _chunk_scan_chunk_state_bwd_dx_kernel(
113
- # Pointers to matrices
114
- x_ptr,
115
- cb_ptr,
116
- dout_ptr,
117
- dt_ptr,
118
- dA_cumsum_ptr,
119
- seq_idx_ptr,
120
- D_ptr,
121
- b_ptr,
122
- dstates_ptr,
123
- dx_ptr,
124
- ddt_ptr,
125
- dD_ptr,
126
- # Matrix dimensions
127
- chunk_size,
128
- hdim,
129
- dstate,
130
- batch,
131
- seqlen,
132
- nheads_ngroups_ratio,
133
- # Strides
134
- stride_x_batch,
135
- stride_x_seqlen,
136
- stride_x_head,
137
- stride_x_hdim,
138
- stride_cb_batch,
139
- stride_cb_chunk,
140
- stride_cb_head,
141
- stride_cb_csize_m,
142
- stride_cb_csize_k,
143
- stride_dout_batch,
144
- stride_dout_seqlen,
145
- stride_dout_head,
146
- stride_dout_hdim,
147
- stride_dt_batch,
148
- stride_dt_chunk,
149
- stride_dt_head,
150
- stride_dt_csize,
151
- stride_dA_cs_batch,
152
- stride_dA_cs_chunk,
153
- stride_dA_cs_head,
154
- stride_dA_cs_csize,
155
- stride_seq_idx_batch,
156
- stride_seq_idx_seqlen,
157
- stride_D_head,
158
- stride_b_batch,
159
- stride_b_seqlen,
160
- stride_b_head,
161
- stride_b_dstate,
162
- stride_dstates_batch,
163
- stride_dstates_chunk,
164
- stride_dstates_head,
165
- stride_dstates_hdim,
166
- stride_dstates_dstate,
167
- stride_dx_batch,
168
- stride_dx_seqlen,
169
- stride_dx_head,
170
- stride_dx_hdim,
171
- stride_ddt_batch,
172
- stride_ddt_chunk,
173
- stride_ddt_head,
174
- stride_ddt_csize,
175
- stride_dD_batch,
176
- stride_dD_chunk,
177
- stride_dD_head,
178
- stride_dD_csize,
179
- stride_dD_hdim,
180
- # Meta-parameters
181
- HAS_D: tl.constexpr,
182
- D_HAS_HDIM: tl.constexpr,
183
- HAS_SEQ_IDX: tl.constexpr,
184
- BLOCK_SIZE_M: tl.constexpr,
185
- BLOCK_SIZE_N: tl.constexpr,
186
- BLOCK_SIZE_K: tl.constexpr,
187
- BLOCK_SIZE_DSTATE: tl.constexpr,
188
- IS_TRITON_22: tl.constexpr,
189
- ):
190
- pid_bc = tl.program_id(axis=1)
191
- pid_c = pid_bc // batch
192
- pid_b = pid_bc - pid_c * batch
193
- pid_h = tl.program_id(axis=2)
194
- num_pid_n = tl.cdiv(hdim, BLOCK_SIZE_N)
195
- pid_m = tl.program_id(axis=0) // num_pid_n
196
- pid_n = tl.program_id(axis=0) % num_pid_n
197
- x_ptr += (
198
- pid_b * stride_x_batch
199
- + pid_c * chunk_size * stride_x_seqlen
200
- + pid_h * stride_x_head
201
- )
202
- cb_ptr += (
203
- pid_b * stride_cb_batch
204
- + pid_c * stride_cb_chunk
205
- + (pid_h // nheads_ngroups_ratio) * stride_cb_head
206
- )
207
- dout_ptr += (
208
- pid_b * stride_dout_batch
209
- + pid_c * chunk_size * stride_dout_seqlen
210
- + pid_h * stride_dout_head
211
- )
212
- dt_ptr += pid_b * stride_dt_batch + pid_c * stride_dt_chunk + pid_h * stride_dt_head
213
- ddt_ptr += (
214
- pid_b * stride_ddt_batch + pid_c * stride_ddt_chunk + pid_h * stride_ddt_head
215
- )
216
- dA_cumsum_ptr += (
217
- pid_b * stride_dA_cs_batch
218
- + pid_c * stride_dA_cs_chunk
219
- + pid_h * stride_dA_cs_head
220
- )
221
- b_ptr += (
222
- pid_b * stride_b_batch
223
- + pid_c * chunk_size * stride_b_seqlen
224
- + (pid_h // nheads_ngroups_ratio) * stride_b_head
225
- )
226
- dstates_ptr += (
227
- pid_b * stride_dstates_batch
228
- + pid_c * stride_dstates_chunk
229
- + pid_h * stride_dstates_head
230
- )
231
- if HAS_SEQ_IDX:
232
- seq_idx_ptr += (
233
- pid_b * stride_seq_idx_batch + pid_c * chunk_size * stride_seq_idx_seqlen
234
- )
235
-
236
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
237
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
238
-
239
- chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
240
-
241
- acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
242
-
243
- dA_cs_m = tl.load(
244
- dA_cumsum_ptr + offs_m * stride_dA_cs_csize,
245
- mask=offs_m < chunk_size_limit,
246
- other=0.0,
247
- ).to(tl.float32)
248
-
249
- dA_cs_last = tl.load(dA_cumsum_ptr + (chunk_size - 1) * stride_dA_cs_csize).to(
250
- tl.float32
251
- )
252
- if not HAS_SEQ_IDX:
253
- scale = tl.exp(dA_cs_last - dA_cs_m)
254
- else:
255
- seq_idx_m = tl.load(
256
- seq_idx_ptr + offs_m * stride_seq_idx_seqlen,
257
- mask=offs_m < chunk_size_limit,
258
- other=-1,
259
- )
260
- seq_idx_last = tl.load(
261
- seq_idx_ptr + (chunk_size_limit - 1) * stride_seq_idx_seqlen
262
- )
263
- scale = tl.where(seq_idx_m == seq_idx_last, tl.exp(dA_cs_last - dA_cs_m), 0.0)
264
- # Might be faster to just do 1 iteration with larger BLOCK_SIZE_K, up to block size 128
265
 -    # However, we're getting an error with the Triton compiler 2.1.0 for that code path:
266
- # Unexpected mma -> mma layout conversion
267
- # Triton 2.2.0 fixes this
268
- offs_dstate = tl.arange(
269
- 0,
270
- (
271
- BLOCK_SIZE_DSTATE
272
- if IS_TRITON_22 and BLOCK_SIZE_DSTATE <= 128
273
- else BLOCK_SIZE_K
274
- ),
275
- )
276
- b_ptrs = b_ptr + (
277
- offs_m[:, None] * stride_b_seqlen + offs_dstate[None, :] * stride_b_dstate
278
- )
279
- dstates_ptrs = dstates_ptr + (
280
- offs_n[None, :] * stride_dstates_hdim
281
- + offs_dstate[:, None] * stride_dstates_dstate
282
- )
283
- if IS_TRITON_22 and BLOCK_SIZE_DSTATE <= 128:
284
- b = tl.load(
285
- b_ptrs,
286
- mask=(offs_m[:, None] < chunk_size_limit) & (offs_dstate[None, :] < dstate),
287
- other=0.0,
288
- )
289
- dstates = tl.load(
290
- dstates_ptrs,
291
- mask=(offs_dstate[:, None] < dstate) & (offs_n[None, :] < hdim),
292
- other=0.0,
293
- )
294
- dstates = dstates.to(b_ptr.dtype.element_ty)
295
- acc = tl.dot(b, dstates) * scale[:, None]
296
- else:
297
- for k in range(0, dstate, BLOCK_SIZE_K):
298
- b = tl.load(
299
- b_ptrs,
300
- mask=(offs_m[:, None] < chunk_size_limit)
301
- & (offs_dstate[None, :] < dstate - k),
302
- other=0.0,
303
- )
304
- dstates = tl.load(
305
- dstates_ptrs,
306
- mask=(offs_dstate[:, None] < dstate - k) & (offs_n[None, :] < hdim),
307
- other=0.0,
308
- )
309
- dstates = dstates.to(b_ptr.dtype.element_ty)
310
- acc += tl.dot(b, dstates)
311
- b_ptrs += BLOCK_SIZE_K * stride_b_dstate
312
- dstates_ptrs += BLOCK_SIZE_K * stride_dstates_dstate
313
- acc *= scale[:, None]
314
-
315
- # x_ptrs = x_ptr + (offs_m[:, None] * stride_x_seqlen + offs_n[None, :] * stride_x_hdim)
316
- # x = tl.load(x_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim), other=0.0).to(tl.float32)
317
- # dt_ptrs = dt_ptr + offs_m * stride_dt_csize
318
- # dt_m = tl.load(dt_ptrs, mask=offs_m < chunk_size_limit, other=0.0).to(tl.float32)
319
- # ddt = tl.sum(acc * x, axis=1) * dt_m
320
- # ddt_ptrs = ddt_ptr + offs_m * stride_ddt_csize
321
- # tl.atomic_add(ddt_ptrs, ddt, mask=offs_m < chunk_size)
322
-
323
- offs_k = tl.arange(0, BLOCK_SIZE_K)
324
- cb_ptrs = cb_ptr + (
325
- offs_m[:, None] * stride_cb_csize_m + offs_k[None, :] * stride_cb_csize_k
326
- )
327
- dout_ptrs = dout_ptr + (
328
- offs_k[:, None] * stride_dout_seqlen + offs_n[None, :] * stride_dout_hdim
329
- )
330
- dA_cumsum_ptrs = dA_cumsum_ptr + offs_k * stride_dA_cs_csize
331
- K_MAX = chunk_size_limit
332
- K_MIN = pid_m * BLOCK_SIZE_M
333
- cb_ptrs += K_MIN * stride_cb_csize_k
334
- dout_ptrs += K_MIN * stride_dout_seqlen
335
- dA_cumsum_ptrs += K_MIN * stride_dA_cs_csize
336
- for k in range(K_MIN, K_MAX, BLOCK_SIZE_K):
337
- k = tl.multiple_of(k, BLOCK_SIZE_K)
338
- # For some reason setting mask to (offs_m[:, None] < chunk_size_limit) is much slower
339
- cb = tl.load(
340
- cb_ptrs,
341
- mask=(offs_m[:, None] < chunk_size) & (offs_k[None, :] < K_MAX - k),
342
- other=0.0,
343
- )
344
- dout = tl.load(
345
- dout_ptrs,
346
- mask=(offs_k[:, None] < K_MAX - k) & (offs_n[None, :] < hdim),
347
- other=0.0,
348
- )
349
- dA_cs_k = tl.load(dA_cumsum_ptrs, mask=offs_k < K_MAX - k, other=0.0).to(
350
- tl.float32
351
- )
352
- cb *= tl.exp(dA_cs_k[None, :] - dA_cs_m[:, None])
353
- # If we don't have the (k + offs_k[None, :] < K_MAX) mask, for indices outside this range,
354
- # we might have dA_cs_m = 0.0 and dA_cs_k very negative, and tl.exp will return inf.
355
- # Multiplying with cb, which is 0.0 outside the range, will make the result NaN.
356
- # This will cause NaN in acc, and hence NaN in dx and ddt.
357
- mask = (k + offs_k[None, :] >= offs_m[:, None]) & (k + offs_k[None, :] < K_MAX)
358
- cb = tl.where(mask, cb, 0.0)
359
- cb = cb.to(dout_ptr.dtype.element_ty)
360
- acc += tl.dot(cb, dout)
361
- cb_ptrs += BLOCK_SIZE_K * stride_cb_csize_k
362
- dout_ptrs += BLOCK_SIZE_K * stride_dout_seqlen
363
- dA_cumsum_ptrs += BLOCK_SIZE_K * stride_dA_cs_csize
364
-
365
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
366
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
367
- dt_ptrs = dt_ptr + offs_m * stride_dt_csize
368
- dt_m = tl.load(dt_ptrs, mask=offs_m < chunk_size_limit, other=0.0).to(tl.float32)
369
- dx = acc * dt_m[:, None]
370
- dx_ptr += (
371
- pid_b * stride_dx_batch
372
- + pid_c * chunk_size * stride_dx_seqlen
373
- + pid_h * stride_dx_head
374
- )
375
- dx_ptrs = dx_ptr + (
376
- offs_m[:, None] * stride_dx_seqlen + offs_n[None, :] * stride_dx_hdim
377
- )
378
- if HAS_D:
379
- dout_res_ptrs = dout_ptr + (
380
- offs_m[:, None] * stride_dout_seqlen + offs_n[None, :] * stride_dout_hdim
381
- )
382
- dout_res = tl.load(
383
- dout_res_ptrs,
384
- mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim),
385
- other=0.0,
386
- ).to(tl.float32)
387
- if D_HAS_HDIM:
388
- D = tl.load(
389
- D_ptr + pid_h * stride_D_head + offs_n, mask=offs_n < hdim, other=0.0
390
- ).to(tl.float32)
391
- else:
392
- D = tl.load(D_ptr + pid_h * stride_D_head).to(tl.float32)
393
- dx += dout_res * D
394
- tl.store(
395
- dx_ptrs,
396
- dx,
397
- mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim),
398
- )
399
-
400
- x_ptrs = x_ptr + (
401
- offs_m[:, None] * stride_x_seqlen + offs_n[None, :] * stride_x_hdim
402
- )
403
- x = tl.load(
404
- x_ptrs,
405
- mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim),
406
- other=0.0,
407
- ).to(tl.float32)
408
- if HAS_D:
409
- dD_ptr += (
410
- pid_b * stride_dD_batch
411
- + pid_c * stride_dD_chunk
412
- + pid_h * stride_dD_head
413
- + pid_m * stride_dD_csize
414
- )
415
- if D_HAS_HDIM:
416
- dD_ptrs = dD_ptr + offs_n * stride_dD_hdim
417
- dD = tl.sum(dout_res * x, axis=0)
418
- tl.store(dD_ptrs, dD, mask=offs_n < hdim)
419
- else:
420
- dD = tl.sum(dout_res * x)
421
- tl.store(dD_ptr, dD)
422
- ddt = tl.sum(acc * x, axis=1)
423
- ddt_ptrs = ddt_ptr + offs_m * stride_ddt_csize
424
- tl.atomic_add(ddt_ptrs, ddt, mask=offs_m < chunk_size)
425
-
426
-
427
- def _chunk_scan_chunk_state_bwd_dx(
428
- x, dt, dA_cumsum, B, CB, dout, dstates, D=None, seq_idx=None, dx=None
429
- ):
430
- batch, seqlen, nheads, headdim = x.shape
431
- _, _, nchunks, chunk_size = dt.shape
432
- _, _, ngroups, dstate = B.shape
433
- assert nheads % ngroups == 0
434
- assert B.shape == (batch, seqlen, ngroups, dstate)
435
- assert CB.shape == (batch, nchunks, ngroups, chunk_size, chunk_size)
436
- assert dt.shape == (batch, nheads, nchunks, chunk_size)
437
- assert dA_cumsum.shape == dt.shape
438
- assert dout.shape == x.shape
439
- assert dstates.shape == (batch, nchunks, nheads, headdim, dstate)
440
- if seq_idx is not None:
441
- assert seq_idx.shape == (batch, seqlen)
442
- if D is not None:
443
- assert D.shape == (nheads, headdim) or D.shape == (nheads,)
444
- assert D.stride(-1) == 1
445
- BLOCK_SIZE_min = 32
446
- dD = torch.empty(
447
- triton.cdiv(chunk_size, BLOCK_SIZE_min),
448
- batch,
449
- nchunks,
450
- nheads,
451
- headdim if D.dim() == 2 else 1,
452
- device=D.device,
453
- dtype=torch.float32,
454
- )
455
- else:
456
- dD = None
457
- dD_strides = (
458
- (dD.stride(0), dD.stride(1), dD.stride(2), dD.stride(3), dD.stride(4))
459
- if D is not None
460
- else (0, 0, 0, 0, 0)
461
- )
462
- if dx is None:
463
- dx = torch.empty_like(x)
464
- else:
465
- assert dx.shape == x.shape
466
- ddt = torch.empty(
467
- batch, nheads, nchunks, chunk_size, device=dout.device, dtype=torch.float32
468
- )
469
- grid_dx = lambda META: (
470
- triton.cdiv(chunk_size, META["BLOCK_SIZE_M"])
471
- * triton.cdiv(headdim, META["BLOCK_SIZE_N"]),
472
- batch * nchunks,
473
- nheads,
474
- )
475
- with torch.cuda.device(x.device.index):
476
- _chunk_scan_chunk_state_bwd_dx_kernel[grid_dx](
477
- x,
478
- CB,
479
- dout,
480
- dt,
481
- dA_cumsum,
482
- seq_idx,
483
- D,
484
- B,
485
- dstates,
486
- dx,
487
- ddt,
488
- dD,
489
- chunk_size,
490
- headdim,
491
- dstate,
492
- batch,
493
- seqlen,
494
- nheads // ngroups,
495
- x.stride(0),
496
- x.stride(1),
497
- x.stride(2),
498
- x.stride(3),
499
- CB.stride(0),
500
- CB.stride(1),
501
- CB.stride(2),
502
- CB.stride(-1),
503
- CB.stride(-2),
504
- dout.stride(0),
505
- dout.stride(1),
506
- dout.stride(2),
507
- dout.stride(3),
508
- dt.stride(0),
509
- dt.stride(2),
510
- dt.stride(1),
511
- dt.stride(3),
512
- dA_cumsum.stride(0),
513
- dA_cumsum.stride(2),
514
- dA_cumsum.stride(1),
515
- dA_cumsum.stride(3),
516
- *(
517
- (seq_idx.stride(0), seq_idx.stride(1))
518
- if seq_idx is not None
519
- else (0, 0)
520
- ),
521
- D.stride(0) if D is not None else 0,
522
- B.stride(0),
523
- B.stride(1),
524
- B.stride(2),
525
- B.stride(3),
526
- dstates.stride(0),
527
- dstates.stride(1),
528
- dstates.stride(2),
529
- dstates.stride(3),
530
- dstates.stride(4),
531
- dx.stride(0),
532
- dx.stride(1),
533
- dx.stride(2),
534
- dx.stride(3),
535
- ddt.stride(0),
536
- ddt.stride(2),
537
- ddt.stride(1),
538
- ddt.stride(3),
539
- dD_strides[1],
540
- dD_strides[2],
541
- dD_strides[3],
542
- dD_strides[0],
543
- dD_strides[4],
544
- D is not None,
545
- D.dim() == 2 if D is not None else True,
546
- HAS_SEQ_IDX=seq_idx is not None,
547
- BLOCK_SIZE_DSTATE=max(triton.next_power_of_2(dstate), 16),
548
- IS_TRITON_22=TRITON_22
549
- )
550
- if D is not None:
551
- BLOCK_SIZE_actual = _chunk_scan_chunk_state_bwd_dx_kernel.best_config.kwargs[
552
- "BLOCK_SIZE_M"
553
- ]
554
- n_valid_blocks = (chunk_size + BLOCK_SIZE_actual - 1) // BLOCK_SIZE_actual
555
- dD = dD[:n_valid_blocks].sum(dim=(0, 1, 2)).to(dtype=D.dtype)
556
- if D.dim() == 1:
557
- dD = rearrange(dD, "h 1 -> h")
558
- return dx, ddt.to(dtype=dt.dtype), dD
559
-
560
-
561
- def _mamba_chunk_scan_combined_fwd(
562
- x,
563
- dt,
564
- A,
565
- B,
566
- C,
567
- chunk_size,
568
- D=None,
569
- z=None,
570
- dt_bias=None,
571
- initial_states=None,
572
- seq_idx=None,
573
- cu_seqlens=None,
574
- dt_softplus=False,
575
- dt_limit=(0.0, float("inf")),
576
- ):
577
- batch, seqlen, nheads, headdim = x.shape
578
- _, _, ngroups, dstate = B.shape
579
- assert nheads % ngroups == 0
580
- assert B.shape == (batch, seqlen, ngroups, dstate)
581
- assert x.shape == (batch, seqlen, nheads, headdim)
582
- assert dt.shape == (batch, seqlen, nheads)
583
- assert A.shape == (nheads,)
584
- assert C.shape == B.shape
585
- if z is not None:
586
- assert z.shape == x.shape
587
- if D is not None:
588
- assert D.shape == (nheads, headdim) or D.shape == (nheads,)
589
- if seq_idx is not None:
590
- assert seq_idx.shape == (batch, seqlen)
591
- if B.stride(-1) != 1:
592
- B = B.contiguous()
593
- if C.stride(-1) != 1:
594
- C = C.contiguous()
595
- if (
596
- x.stride(-1) != 1 and x.stride(1) != 1
597
- ): # Either M or K dimension should be contiguous
598
- x = x.contiguous()
599
- if (
600
- z is not None and z.stride(-1) != 1 and z.stride(1) != 1
601
- ): # Either M or K dimension should be contiguous
602
- z = z.contiguous()
603
- if D is not None and D.stride(-1) != 1:
604
- D = D.contiguous()
605
- if initial_states is not None:
606
- assert initial_states.shape == (batch, nheads, headdim, dstate)
607
- # # (batch, nchunks, chunk_size, chunk_size) or (batch, nchunks, nheads, chunk_size, chunk_size)
608
- # dA_cumsum_tmp0, dt_tmp0 = _chunk_cumsum_fwd(dt[:, :147], A, chunk_size, dt_bias=dt_bias, dt_softplus=dt_softplus)
609
- # dA_cumsum_tmp1, dt_tmp1 = _chunk_cumsum_fwd(dt[:, 147:], A, chunk_size, dt_bias=dt_bias, dt_softplus=dt_softplus)
610
- # dA_cumsum_tmp2, dt_tmp2 = _chunk_cumsum_fwd(dt[:, 147:256], A, chunk_size, dt_bias=dt_bias, dt_softplus=dt_softplus)
611
- dA_cumsum, dt = _chunk_cumsum_fwd(
612
- dt, A, chunk_size, dt_bias=dt_bias, dt_softplus=dt_softplus, dt_limit=dt_limit
613
- )
614
- states = _chunk_state_fwd(B, x, dt, dA_cumsum, seq_idx=seq_idx, states_in_fp32=True)
615
- # states_tmp0 = _chunk_state_fwd(B[:, :147], x[:, :147], dt_tmp0, dA_cumsum_tmp0, states_in_fp32=True)
616
- # states_tmp1 = _chunk_state_fwd(B[:, 147:], x[:, 147:], dt_tmp1, dA_cumsum_tmp1, states_in_fp32=True)
617
- # states_tmp2 = _chunk_state_fwd(B[:, 147:256], x[:, 147:256], dt_tmp2, dA_cumsum_tmp2, states_in_fp32=True)
618
- states, final_states = _state_passing_fwd(
619
- rearrange(states, "... p n -> ... (p n)"),
620
- dA_cumsum[:, :, :, -1],
621
- initial_states=(
622
- rearrange(initial_states, "... p n -> ... (p n)")
623
- if initial_states is not None
624
- else None
625
- ),
626
- seq_idx=seq_idx,
627
- chunk_size=chunk_size,
628
- out_dtype=C.dtype,
629
- )
630
- states, final_states = [
631
- rearrange(t, "... (p n) -> ... p n", n=dstate) for t in [states, final_states]
632
- ]
633
- # states_tmp0 = rearrange(_state_passing_fwd(rearrange(states_tmp0, "... p n -> ... (p n)"), dA_cumsum_tmp0[:, :, :, -1], chunk_size=chunk_size), "... (p n) -> ... p n", n=dstate)
634
- # states_tmp1 = rearrange(_state_passing_fwd(rearrange(states_tmp1, "... p n -> ... (p n)"), dA_cumsum_tmp1[:, :, :, -1], chunk_size=chunk_size), "... (p n) -> ... p n", n=dstate)
635
- CB = _bmm_chunk_fwd(C, B, chunk_size, seq_idx=seq_idx, output_dtype=torch.float32)
636
- out, out_x = _chunk_scan_fwd(
637
- CB, x, dt, dA_cumsum, C, states, D=D, z=z, seq_idx=seq_idx
638
- )
639
- if cu_seqlens is None:
640
- return out, out_x, dt, dA_cumsum, states, final_states
641
- else:
642
- assert (
643
- batch == 1
644
- ), "passing cu_seqlens to get the varlen states is only supported if batch dimension is 1"
645
- varlen_states = chunk_state_varlen(
646
- B.squeeze(0),
647
- x.squeeze(0),
648
- dt.squeeze(0),
649
- dA_cumsum.squeeze(0),
650
- cu_seqlens,
651
- states.squeeze(0),
652
- )
653
- return out, out_x, dt, dA_cumsum, states, final_states, varlen_states
654
-
655
-
656
- def _mamba_chunk_scan_combined_bwd(
657
- dout,
658
- x,
659
- dt,
660
- A,
661
- B,
662
- C,
663
- out,
664
- chunk_size,
665
- D=None,
666
- z=None,
667
- dt_bias=None,
668
- initial_states=None,
669
- dfinal_states=None,
670
- seq_idx=None,
671
- dt_softplus=False,
672
- dt_limit=(0.0, float("inf")),
673
- dx=None,
674
- ddt=None,
675
- dB=None,
676
- dC=None,
677
- dz=None,
678
- recompute_output=False,
679
- ):
680
- if dout.stride(-1) != 1:
681
- dout = dout.contiguous()
682
- batch, seqlen, nheads, headdim = x.shape
683
- nchunks = math.ceil(seqlen / chunk_size)
684
- _, _, ngroups, dstate = B.shape
685
- assert dout.shape == (batch, seqlen, nheads, headdim)
686
- assert dt.shape == (batch, seqlen, nheads)
687
- assert A.shape == (nheads,)
688
- assert nheads % ngroups == 0
689
- assert B.shape == (batch, seqlen, ngroups, dstate)
690
- assert C.shape == B.shape
691
- assert out.shape == x.shape
692
- if initial_states is not None:
693
- assert initial_states.shape == (batch, nheads, headdim, dstate)
694
- if seq_idx is not None:
695
- assert seq_idx.shape == (batch, seqlen)
696
- if dx is not None:
697
- assert dx.shape == x.shape
698
- if dB is not None:
699
- assert dB.shape == B.shape
700
- dB_given = dB
701
- else:
702
- dB_given = torch.empty_like(B)
703
- if dC is not None:
704
- assert dC.shape == C.shape
705
- dC_given = dC
706
- else:
707
- dC_given = torch.empty_like(C)
708
- if dz is not None:
709
- assert z is not None
710
- assert dz.shape == z.shape
711
- if ddt is not None:
712
- assert ddt.shape == dt.shape
713
- ddt_given = ddt
714
- else:
715
- ddt_given = torch.empty_like(dt)
716
- # TD: For some reason Triton (2.1.0 and 2.2.0) errors with
717
- # "[CUDA]: invalid device context" (e.g. during varlne test), and cloning makes it work. Idk why.
718
- dt_in = dt.clone()
719
- dA_cumsum, dt = _chunk_cumsum_fwd(
720
- dt_in,
721
- A,
722
- chunk_size,
723
- dt_bias=dt_bias,
724
- dt_softplus=dt_softplus,
725
- dt_limit=dt_limit,
726
- )
727
- CB = _bmm_chunk_fwd(C, B, chunk_size, seq_idx=seq_idx, output_dtype=torch.float32)
728
- states = _chunk_state_fwd(B, x, dt, dA_cumsum, seq_idx=seq_idx, states_in_fp32=True)
729
- states, _ = _state_passing_fwd(
730
- rearrange(states, "... p n -> ... (p n)"),
731
- dA_cumsum[:, :, :, -1],
732
- initial_states=(
733
- rearrange(initial_states, "... p n -> ... (p n)")
734
- if initial_states is not None
735
- else None
736
- ),
737
- seq_idx=seq_idx,
738
- chunk_size=chunk_size,
739
- )
740
- states = rearrange(states, "... (p n) -> ... p n", n=dstate)
741
- if z is not None:
742
- dz, dout, dD, *rest = _chunk_scan_bwd_dz(
743
- x,
744
- z,
745
- out,
746
- dout,
747
- chunk_size=chunk_size,
748
- has_ddAcs=False,
749
- D=D,
750
- dz=dz,
751
- recompute_output=recompute_output,
752
- )
753
- outz = rest[0] if recompute_output else out
754
- else:
755
- dz = None
756
- outz = out
757
- dstates = _chunk_scan_bwd_dstates(
758
- C, dA_cumsum, dout, seq_idx=seq_idx, dtype=states.dtype
759
- )
760
- # dstates has length nchunks, containing the gradient to initial states at index 0 and
761
- # gradient to the states of chunk (nchunks - 2) at index (nchunks - 1)
762
- # Do computation in fp32 but convert dstates and states to fp16/bf16 since dstates and states
763
- # will be used in matmul in the next kernels.
764
- dstates, ddA_chunk_cumsum, dinitial_states, states = _state_passing_bwd(
765
- rearrange(states, "... p n -> ... (p n)"),
766
- dA_cumsum[:, :, :, -1],
767
- rearrange(dstates, "... p n -> ... (p n)"),
768
- dfinal_states=(
769
- rearrange(dfinal_states, "... p n -> ... (p n)")
770
- if dfinal_states is not None
771
- else None
772
- ),
773
- seq_idx=seq_idx,
774
- has_initial_states=initial_states is not None,
775
- dstates_dtype=x.dtype,
776
- states_dtype=x.dtype,
777
- chunk_size=chunk_size,
778
- )
779
- # dstates has length nchunks, containing the gradient to states of chunk 0 at index 0 and
780
- # gradient to the final states at index (nchunks - 1)
781
- # states has length nchunks, containing the initial states at index 0 and the state for chunk (nchunks - 2) at index (nchunks - 1)
782
- # The final states is not stored.
783
- states = rearrange(states, "... (p n) -> ... p n", n=dstate)
784
- dstates = rearrange(dstates, "... (p n) -> ... p n", n=dstate)
785
- dinitial_states = (
786
- rearrange(dinitial_states, "... (p n) -> ... p n", n=dstate)
787
- if dinitial_states is not None
788
- else None
789
- )
790
- dx, ddt, dD_from_x = _chunk_scan_chunk_state_bwd_dx(
791
- x, dt, dA_cumsum, B, CB, dout, dstates, D=D, seq_idx=seq_idx, dx=dx
792
- )
793
- # dB = _chunk_state_bwd_db(x, dt, dA_cumsum, dstates, seq_idx=seq_idx, ngroups=ngroups)
794
- dB, ddA_next = _chunk_state_bwd_db(
795
- x, dt, dA_cumsum, dstates, seq_idx=seq_idx, B=B, ngroups=ngroups
796
- )
797
- # dC = _chunk_scan_bwd_dC(states[:, :-1].to(x.dtype), dA_cumsum, dout, seq_idx=seq_idx, ngroups=ngroups)
798
- dC, ddA_cumsum_prev = _chunk_scan_bwd_dC(
799
- states.to(x.dtype), dA_cumsum, dout, seq_idx=seq_idx, C=C, ngroups=ngroups
800
- )
801
- # Computing ddA with the dcb kernel is much slower, so we're not using it for now
802
- dCB = _chunk_scan_bwd_dcb(x, dt, dA_cumsum, dout, seq_idx=seq_idx, ngroups=ngroups)
803
- # dCB, ddA_tmp = _chunk_scan_bwd_dcb(x, dt, dA_cumsum, dout, seq_idx=seq_idx, CB=CB, ngroups=ngroups)
804
- dCB = dCB.to(CB.dtype)
805
- _bmm_chunk_bwd(C, dCB, residual=dB, out=dB_given)
806
- _bmm_chunk_bwd(B, rearrange(dCB, "... l s -> ... s l"), residual=dC, out=dC_given)
807
- # If we have z, then dout_x is recomputed in fp32 so dD = (dout_x * x).sum() is more accurate
808
- # than dD_from_x = (dout_x * x).sum() where dout_x is in fp16/bf16
809
- if z is None:
810
- dD = dD_from_x
811
- # Formula for ddA_cumsum, assuming out is the output of the forward pass before adding x * D.
812
- # ddA_cumsum = torch.einsum("bclhp,bclhp->bhcl", out.float(), dout.float()) - ddt * dt
813
- # However, this is numerically unstable: when we do the reverse cumsum on ddA_cumsum, there might
814
- # be a lot of underflow.
815
-
816
- # This is already done as part of bwd_dC kernel
817
- # ddA_cumsum_prev = _chunk_scan_bwd_ddAcs_prev(states[:, :-1], C, dout, dA_cumsum, seq_idx=seq_idx)
818
- ddA_cumsum_prev[..., -1] += ddA_chunk_cumsum
819
- ddA_prev = ddA_cumsum_prev.flip([-1]).cumsum(dim=-1).flip([-1])
820
- # This is already done as part of bwd_dB kernel
821
- # ddA_next = _chunk_state_bwd_ddAcs_stable(B, x, dt, dA_cumsum, dstates, seq_idx=seq_idx)
822
- # We don't need to pass in seq_idx because CB also zeros out entries where seq_idx[i] != seq_idx[j]
823
- ddA = _chunk_scan_bwd_ddAcs_stable(x, dt, dA_cumsum, dout, CB)
824
- ddA += ddA_next + ddA_prev
825
-
826
- ddt_given, dA, ddt_bias = _chunk_cumsum_bwd(
827
- ddA,
828
- ddt,
829
- dt_in,
830
- A,
831
- dt_bias=dt_bias,
832
- dt_softplus=dt_softplus,
833
- dt_limit=dt_limit,
834
- ddt=ddt_given,
835
- )
836
-
837
- # These 2 lines are just to test ddt and dA being computed by old code
838
- # _, dA = selective_scan_bwd(dout, x, dt, A, B, C, D=D.float(), z=z)
839
- # ddt_given.copy_(ddt)
840
-
841
- return_vals = (
842
- dx,
843
- ddt_given,
844
- dA,
845
- dB_given,
846
- dC_given,
847
- dD,
848
- dz,
849
- ddt_bias,
850
- dinitial_states,
851
- )
852
- return return_vals if not recompute_output else (*return_vals, outz)
853
-
854
-
855
- def selective_scan_bwd(dout, x, dt, A, B, C, D=None, z=None):
856
- """
857
- Argument:
858
- dout: (batch, seqlen, nheads, headdim)
859
- x: (batch, seqlen, nheads, headdim)
860
- dt: (batch, nheads, nchunks, chunk_size) or (batch, nheads, headdim, nchunks, chunk_size)
861
- A: (nheads) or (dim, dstate)
862
- B: (batch, seqlen, ngroups, dstate)
863
- C: (batch, seqlen, ngroups, dstate)
864
- D: (nheads, headdim) or (nheads,)
865
- z: (batch, seqlen, nheads, headdim)
866
- Return:
867
- out: (batch, seqlen, nheads, headdim)
868
- """
869
- import selective_scan
870
-
871
- batch, seqlen, nheads, headdim = x.shape
872
- chunk_size = dt.shape[-1]
873
- _, _, ngroups, dstate = B.shape
874
- assert nheads % ngroups == 0
875
- x = rearrange(x, "b l h p -> b (h p) l")
876
- squeeze_dt = dt.dim() == 4
877
- if dt.dim() == 4:
878
- dt = repeat(dt, "b h c l -> b h p c l", p=headdim)
879
- dt = rearrange(dt, "b h p c l -> b (h p) (c l)", p=headdim)
880
- squeeze_A = A.dim() == 1
881
- if A.dim() == 1:
882
- A = repeat(A, "h -> (h p) n", p=headdim, n=dstate).to(dtype=torch.float32)
883
- else:
884
- A = A.to(dtype=torch.float32)
885
- B = rearrange(B, "b l g n -> b g n l")
886
- C = rearrange(C, "b l g n -> b g n l")
887
- if D is not None:
888
- if D.dim() == 2:
889
- D = rearrange(D, "h p -> (h p)")
890
- else:
891
- D = repeat(D, "h -> (h p)", p=headdim)
892
- if z is not None:
893
- z = rearrange(z, "b l h p -> b (h p) l")
894
-
895
- if x.stride(-1) != 1:
896
- x = x.contiguous()
897
- if dt.stride(-1) != 1:
898
- dt = dt.contiguous()
899
- if D is not None:
900
- D = D.contiguous()
901
- if B.stride(-1) != 1:
902
- B = B.contiguous()
903
- if C.stride(-1) != 1:
904
- C = C.contiguous()
905
- if z is not None and z.stride(-1) != 1:
906
- z = z.contiguous()
907
- _, intermediate, *rest = selective_scan.fwd(
908
- x, dt.to(dtype=x.dtype), A, B, C, D, z, None, False
909
- )
910
- if z is not None:
911
- out = rest[0]
912
- else:
913
- out = None
914
-
915
- dout = rearrange(dout, "b l h p -> b (h p) l")
916
-
917
- if dout.stride(-1) != 1:
918
- dout = dout.contiguous()
919
- # The kernel supports passing in a pre-allocated dz (e.g., in case we want to fuse the
920
- # backward of selective_scan with the backward of chunk).
921
- # Here we just pass in None and dz will be allocated in the C++ code.
922
- _, ddt, dA, *rest = selective_scan.bwd(
923
- x,
924
- dt.to(dtype=x.dtype),
925
- A,
926
- B,
927
- C,
928
- D,
929
- z,
930
- None,
931
- dout,
932
- intermediate,
933
- out,
934
- None,
935
- False,
936
- False, # option to recompute out_z, not used here
937
- )
938
- ddt = rearrange(ddt, "b (h p) (c l) -> b h p c l", p=headdim, l=chunk_size)
939
- if squeeze_dt:
940
- ddt = ddt.float().sum(dim=2)
941
- if squeeze_A:
942
- dA = rearrange(dA, "(h p) n -> h p n", p=headdim).sum(dim=(1, 2))
943
- return ddt, dA
944
-
945
-
946
- class MambaChunkScanCombinedFn(torch.autograd.Function):
947
-
948
- @staticmethod
949
- def forward(
950
- ctx,
951
- x,
952
- dt,
953
- A,
954
- B,
955
- C,
956
- chunk_size,
957
- D=None,
958
- z=None,
959
- dt_bias=None,
960
- initial_states=None,
961
- seq_idx=None,
962
- cu_seqlens=None,
963
- dt_softplus=False,
964
- dt_limit=(0.0, float("inf")),
965
- return_final_states=False,
966
- return_varlen_states=False,
967
- ):
968
- ctx.dt_dtype = dt.dtype
969
- if not return_varlen_states:
970
- cu_seqlens = None
971
- else:
972
- assert (
973
- cu_seqlens is not None
974
- ), "cu_seqlens must be provided if return_varlen_states is True"
975
- out, out_x, dt_out, dA_cumsum, states, final_states, *rest = (
976
- _mamba_chunk_scan_combined_fwd(
977
- x,
978
- dt,
979
- A,
980
- B,
981
- C,
982
- chunk_size,
983
- D=D,
984
- z=z,
985
- dt_bias=dt_bias,
986
- initial_states=initial_states,
987
- seq_idx=seq_idx,
988
- cu_seqlens=cu_seqlens,
989
- dt_softplus=dt_softplus,
990
- dt_limit=dt_limit,
991
- )
992
- )
993
- ctx.save_for_backward(
994
- out if z is None else out_x,
995
- x,
996
- dt,
997
- dA_cumsum,
998
- A,
999
- B,
1000
- C,
1001
- D,
1002
- z,
1003
- dt_bias,
1004
- initial_states,
1005
- seq_idx,
1006
- )
1007
- ctx.dt_softplus = dt_softplus
1008
- ctx.chunk_size = chunk_size
1009
- ctx.dt_limit = dt_limit
1010
- ctx.return_final_states = return_final_states
1011
- ctx.return_varlen_states = return_varlen_states
1012
- if not return_varlen_states:
1013
- return out if not return_final_states else (out, final_states)
1014
- else:
1015
- varlen_states = rest[0]
1016
- return (
1017
- (out, varlen_states)
1018
- if not return_final_states
1019
- else (out, final_states, varlen_states)
1020
- )
1021
-
1022
- @staticmethod
1023
- def backward(ctx, dout, *args):
1024
- out, x, dt, dA_cumsum, A, B, C, D, z, dt_bias, initial_states, seq_idx = (
1025
- ctx.saved_tensors
1026
- )
1027
- assert (
1028
- not ctx.return_varlen_states
1029
- ), "return_varlen_states is not supported in backward"
1030
- dfinal_states = args[0] if ctx.return_final_states else None
1031
- dx, ddt, dA, dB, dC, dD, dz, ddt_bias, dinitial_states = (
1032
- _mamba_chunk_scan_combined_bwd(
1033
- dout,
1034
- x,
1035
- dt,
1036
- A,
1037
- B,
1038
- C,
1039
- out,
1040
- ctx.chunk_size,
1041
- D=D,
1042
- z=z,
1043
- dt_bias=dt_bias,
1044
- initial_states=initial_states,
1045
- dfinal_states=dfinal_states,
1046
- seq_idx=seq_idx,
1047
- dt_softplus=ctx.dt_softplus,
1048
- dt_limit=ctx.dt_limit,
1049
- )
1050
- )
1051
- return (
1052
- dx,
1053
- ddt,
1054
- dA,
1055
- dB,
1056
- dC,
1057
- None,
1058
- dD,
1059
- dz,
1060
- ddt_bias,
1061
- dinitial_states,
1062
- None,
1063
- None,
1064
- None,
1065
- None,
1066
- None,
1067
- None,
1068
- )
1069
-
1070
-
1071
- def mamba_chunk_scan_combined(
1072
- x,
1073
- dt,
1074
- A,
1075
- B,
1076
- C,
1077
- chunk_size,
1078
- D=None,
1079
- z=None,
1080
- dt_bias=None,
1081
- initial_states=None,
1082
- seq_idx=None,
1083
- cu_seqlens=None,
1084
- dt_softplus=False,
1085
- dt_limit=(0.0, float("inf")),
1086
- return_final_states=False,
1087
- return_varlen_states=False,
1088
- ):
1089
- """
1090
- Argument:
1091
- x: (batch, seqlen, nheads, headdim)
1092
- dt: (batch, seqlen, nheads)
1093
- A: (nheads)
1094
- B: (batch, seqlen, ngroups, dstate)
1095
- C: (batch, seqlen, ngroups, dstate)
1096
- chunk_size: int
1097
- D: (nheads, headdim) or (nheads,)
1098
- z: (batch, seqlen, nheads, headdim)
1099
- dt_bias: (nheads,)
1100
- initial_states: (batch, nheads, headdim, dstate)
1101
- seq_idx: (batch, seqlen)
1102
- cu_seqlens: (num_sequences + 1) or None, only used if return_varlen_states is True
1103
- dt_softplus: Whether to apply softplus to dt
1104
- Return:
1105
- out: (batch, seqlen, nheads, headdim)
1106
- """
1107
- return MambaChunkScanCombinedFn.apply(
1108
- x,
1109
- dt,
1110
- A,
1111
- B,
1112
- C,
1113
- chunk_size,
1114
- D,
1115
- z,
1116
- dt_bias,
1117
- initial_states,
1118
- seq_idx,
1119
- cu_seqlens,
1120
- dt_softplus,
1121
- dt_limit,
1122
- return_final_states,
1123
- return_varlen_states,
1124
- )
1125
-
1126
-
1127
- def mamba_chunk_scan(
1128
- x, dt, A, B, C, chunk_size, D=None, z=None, dt_bias=None, dt_softplus=False
1129
- ):
1130
- """
1131
- Argument:
1132
- x: (batch, seqlen, nheads, headdim)
1133
- dt: (batch, seqlen, nheads)
1134
- A: (nheads)
1135
- B: (batch, seqlen, ngroups, dstate)
1136
- C: (batch, seqlen, ngroups, dstate)
1137
- D: (nheads, headdim) or (nheads,)
1138
- z: (batch, seqlen, nheads, headdim)
1139
- dt_bias: (nheads,)
1140
- Return:
1141
- out: (batch, seqlen, nheads, headdim)
1142
- """
1143
- batch, seqlen, nheads, headdim = x.shape
1144
- dstate = B.shape[-1]
1145
- if seqlen % chunk_size != 0:
1146
- dt = F.pad(dt, (0, 0, 0, chunk_size - seqlen % chunk_size))
1147
- dt = rearrange(dt, "b (c l) h -> b h c l", l=chunk_size)
1148
- dt = dt.float() # We want high precision for this before cumsum
1149
- if dt_bias is not None:
1150
- dt = dt + rearrange(dt_bias, "h -> h 1 1")
1151
- if dt_softplus:
1152
- dt = F.softplus(dt)
1153
- dA = dt * rearrange(A, "h -> h 1 1")
1154
- dA = dt * rearrange(A, "h -> h 1 1")
1155
- dA_cumsum = torch.cumsum(dA, dim=-1)
1156
- # 1. Compute the state for each chunk
1157
- states = chunk_state(B, x, dt, dA_cumsum, states_in_fp32=True)
1158
- # 2. Pass the state to all the chunks by weighted cumsum.
1159
- states = rearrange(
1160
- state_passing(
1161
- rearrange(states, "... p n -> ... (p n)"), dA_cumsum[:, :, :, -1]
1162
- )[0],
1163
- "... (p n) -> ... p n",
1164
- n=dstate,
1165
- )
1166
- # 3. Compute the output for each chunk
1167
- out = chunk_scan(B, C, x, dt, dA_cumsum, states, D=D, z=z)
1168
- return out
1169
-
1170
-
1171
- def ssd_chunk_scan_combined_ref(
1172
- x, dt, A, B, C, chunk_size, D=None, z=None, dt_bias=None, dt_softplus=False
1173
- ):
1174
- """
1175
- Argument:
1176
- x: (batch, seqlen, nheads, headdim)
1177
- dt: (batch, seqlen, nheads)
1178
- A: (nheads)
1179
- B: (batch, seqlen, ngroups, dstate)
1180
- C: (batch, seqlen, ngroups, dstate)
1181
- D: (nheads, headdim) or (nheads,)
1182
- z: (batch, seqlen, nheads, headdim)
1183
- dt_bias: (nheads,)
1184
- Return:
1185
- out: (batch, seqlen, nheads, headdim)
1186
- """
1187
- batch, seqlen, nheads, headdim = x.shape
1188
- dstate = B.shape[-1]
1189
- if seqlen % chunk_size != 0:
1190
- dt = F.pad(dt, (0, 0, 0, chunk_size - seqlen % chunk_size))
1191
- dt = rearrange(dt, "b (c l) h -> b h c l", l=chunk_size)
1192
- dt = dt.float() # We want high precision for this before cumsum
1193
- if dt_bias is not None:
1194
- dt = dt + rearrange(dt_bias, "h -> h 1 1")
1195
- if dt_softplus:
1196
- dt = F.softplus(dt)
1197
- dA = dt * rearrange(A, "h -> h 1 1")
1198
- dA_cumsum = torch.cumsum(dA, dim=-1)
1199
- # 1. Compute the state for each chunk
1200
- states = chunk_state_ref(B, x, dt, dA_cumsum)
1201
- states_dtype = states.dtype
1202
- if states.dtype not in [torch.float32, torch.float64]:
1203
- states = states.to(torch.float32)
1204
- # 2. Pass the state to all the chunks by weighted cumsum.
1205
- # state_passing_ref is much less numerically stable
1206
- states = rearrange(
1207
- state_passing_ref(
1208
- rearrange(states, "... p n -> ... (p n)"), dA_cumsum[:, :, :, -1]
1209
- )[0],
1210
- "... (p n) -> ... p n",
1211
- n=dstate,
1212
- )
1213
- states = states.to(states_dtype)
1214
- # 3. Compute the output for each chunk
1215
- out = chunk_scan_ref(B, C, x, dt, dA_cumsum, states, D=D, z=z)
1216
- return out
1217
-
1218
-
1219
- def ssd_selective_scan(
1220
- x,
1221
- dt,
1222
- A,
1223
- B,
1224
- C,
1225
- D=None,
1226
- z=None,
1227
- dt_bias=None,
1228
- dt_softplus=False,
1229
- dt_limit=(0.0, float("inf")),
1230
- ):
1231
- """
1232
- Argument:
1233
- x: (batch, seqlen, nheads, headdim)
1234
- dt: (batch, seqlen, nheads) or (batch, seqlen, nheads, headdim)
1235
- A: (nheads) or (dim, dstate)
1236
- B: (batch, seqlen, ngroups, dstate)
1237
- C: (batch, seqlen, ngroups, dstate)
1238
- D: (nheads, headdim) or (nheads,)
1239
- z: (batch, seqlen, nheads, headdim)
1240
- dt_bias: (nheads,) or (nheads, headdim)
1241
- Return:
1242
- out: (batch, seqlen, nheads, headdim)
1243
- """
1244
- from ..selective_scan_interface import selective_scan_fn
1245
-
1246
- batch, seqlen, nheads, headdim = x.shape
1247
- _, _, ngroups, dstate = B.shape
1248
- x = rearrange(x, "b l h p -> b (h p) l")
1249
- if dt.dim() == 3:
1250
- dt = repeat(dt, "b l h -> b l h p", p=headdim)
1251
- dt = rearrange(dt, "b l h p -> b (h p) l")
1252
- if A.dim() == 1:
1253
- A = repeat(A, "h -> (h p) n", p=headdim, n=dstate).to(dtype=torch.float32)
1254
- else:
1255
- A = A.to(dtype=torch.float32)
1256
- B = rearrange(B, "b l g n -> b g n l")
1257
- C = rearrange(C, "b l g n -> b g n l")
1258
- if D is not None:
1259
- if D.dim() == 2:
1260
- D = rearrange(D, "h p -> (h p)")
1261
- else:
1262
- D = repeat(D, "h -> (h p)", p=headdim)
1263
- if z is not None:
1264
- z = rearrange(z, "b l h p -> b (h p) l")
1265
- if dt_bias is not None:
1266
- if dt_bias.dim() == 1:
1267
- dt_bias = repeat(dt_bias, "h -> h p", p=headdim)
1268
- dt_bias = rearrange(dt_bias, "h p -> (h p)")
1269
- if dt_limit != (0.0, float("inf")):
1270
- if dt_bias is not None:
1271
- dt = dt + rearrange(dt_bias, "d -> d 1")
1272
- if dt_softplus:
1273
- dt = F.softplus(dt)
1274
- dt = dt.clamp(min=dt_limit[0], max=dt_limit[1]).to(x.dtype)
1275
- dt_bias = None
1276
- dt_softplus = None
1277
- out = selective_scan_fn(
1278
- x, dt, A, B, C, D=D, z=z, delta_bias=dt_bias, delta_softplus=dt_softplus
1279
- )
1280
- return rearrange(out, "b (h p) l -> b l h p", p=headdim)
1281
-
1282
-
1283
- def mamba_conv1d_scan_ref(
1284
- xBC,
1285
- conv1d_weight,
1286
- conv1d_bias,
1287
- dt,
1288
- A,
1289
- chunk_size,
1290
- D=None,
1291
- z=None,
1292
- dt_bias=None,
1293
- dt_softplus=False,
1294
- dt_limit=(0.0, float("inf")),
1295
- activation="silu",
1296
- headdim=None,
1297
- ngroups=1,
1298
- ):
1299
- """
1300
- Argument:
1301
- xBC: (batch, seqlen, dim + 2 * ngroups * dstate) where dim == nheads * headdim
1302
- conv1d_weight: (dim + 2 * ngroups * dstate, width)
1303
- conv1d_bias: (dim + 2 * ngroups * dstate,)
1304
- dt: (batch, seqlen, nheads) or (batch, seqlen, nheads, headdim)
1305
- A: (nheads)
1306
- D: (nheads, headdim) or (nheads,)
1307
- z: (batch, seqlen, dim)
1308
- dt_bias: (nheads) or (nheads, headdim)
1309
- headdim: if D is 1D and z is None, headdim must be passed in
1310
- Return:
1311
- out: (batch, seqlen, dim)
1312
- """
1313
- batch, seqlen, nheads = dt.shape[:3]
1314
- assert nheads % ngroups == 0
1315
- if z is not None:
1316
- dim = z.shape[-1]
1317
- assert dim % nheads == 0
1318
- headdim = dim // nheads
1319
- else:
1320
- if D.dim() == 1:
1321
- assert headdim is not None
1322
- else:
1323
- headdim = D.shape[1]
1324
- dim = nheads * headdim
1325
- xBC = rearrange(
1326
- causal_conv1d_fn(
1327
- rearrange(xBC, "b s d -> b d s"),
1328
- conv1d_weight,
1329
- conv1d_bias,
1330
- activation=activation,
1331
- ),
1332
- "b d s -> b s d",
1333
- )
1334
- dstate = (xBC.shape[-1] - dim) // ngroups // 2
1335
- x, B, C = torch.split(xBC, [dim, ngroups * dstate, ngroups * dstate], dim=-1)
1336
- x = rearrange(x, "b l (h p) -> b l h p", h=nheads)
1337
- B = rearrange(B, "b l (g n) -> b l g n", g=ngroups)
1338
- C = rearrange(C, "b l (g n) -> b l g n", g=ngroups)
1339
- z = rearrange(z, "b l (h p) -> b l h p", h=nheads) if z is not None else None
1340
- out = ssd_selective_scan(
1341
- x,
1342
- dt.to(x.dtype),
1343
- A,
1344
- B,
1345
- C,
1346
- D=D.float(),
1347
- z=z,
1348
- dt_bias=dt_bias,
1349
- dt_softplus=dt_softplus,
1350
- dt_limit=dt_limit,
1351
- )
1352
- return rearrange(out, "b s h p -> b s (h p)")
1353
-
1354
-
1355
- class MambaSplitConv1dScanCombinedFn(torch.autograd.Function):
1356
-
1357
- @staticmethod
1358
- @custom_fwd
1359
- def forward(
1360
- ctx,
1361
- zxbcdt,
1362
- conv1d_weight,
1363
- conv1d_bias,
1364
- dt_bias,
1365
- A,
1366
- D,
1367
- chunk_size,
1368
- initial_states=None,
1369
- seq_idx=None,
1370
- dt_limit=(0.0, float("inf")),
1371
- return_final_states=False,
1372
- activation="silu",
1373
- rmsnorm_weight=None,
1374
- rmsnorm_eps=1e-6,
1375
- outproj_weight=None,
1376
- outproj_bias=None,
1377
- headdim=None,
1378
- ngroups=1,
1379
- norm_before_gate=True,
1380
- ):
1381
- assert activation in [None, "silu", "swish"]
1382
- if D.dim() == 1:
1383
- assert headdim is not None
1384
- (nheads,) = D.shape
1385
- else:
1386
- nheads, headdim = D.shape
1387
- batch, seqlen, _ = zxbcdt.shape
1388
- dim = nheads * headdim
1389
- assert nheads % ngroups == 0
1390
- dstate = (conv1d_weight.shape[0] - dim) // ngroups // 2
1391
- d_nonssm = (zxbcdt.shape[-1] - 2 * dim - 2 * ngroups * dstate - nheads) // 2
1392
- assert d_nonssm >= 0
1393
- assert zxbcdt.shape == (
1394
- batch,
1395
- seqlen,
1396
- 2 * d_nonssm + 2 * dim + 2 * ngroups * dstate + nheads,
1397
- )
1398
- assert dt_bias.shape == (nheads,)
1399
- assert A.shape == (nheads,)
1400
- zx0, z, xBC, dt = torch.split(
1401
- zxbcdt, [2 * d_nonssm, dim, dim + ngroups * dstate * 2, nheads], dim=-1
1402
- )
1403
- seq_idx = seq_idx.contiguous() if seq_idx is not None else None
1404
- xBC_conv = rearrange(
1405
- causal_conv1d_cuda.causal_conv1d_fwd(
1406
- rearrange(xBC, "b s d -> b d s"),
1407
- conv1d_weight,
1408
- conv1d_bias,
1409
- seq_idx,
1410
- None,
1411
- None,
1412
- activation in ["silu", "swish"],
1413
- ),
1414
- "b d s -> b s d",
1415
- )
1416
- x, B, C = torch.split(
1417
- xBC_conv, [dim, ngroups * dstate, ngroups * dstate], dim=-1
1418
- )
1419
- x = rearrange(x, "b l (h p) -> b l h p", h=nheads)
1420
- B = rearrange(B, "b l (g n) -> b l g n", g=ngroups)
1421
- C = rearrange(C, "b l (g n) -> b l g n", g=ngroups)
1422
- z = rearrange(z, "b l (h p) -> b l h p", h=nheads) if z is not None else None
1423
- if rmsnorm_weight is None:
1424
- out, out_x, dt_out, dA_cumsum, states, final_states = (
1425
- _mamba_chunk_scan_combined_fwd(
1426
- x,
1427
- dt,
1428
- A,
1429
- B,
1430
- C,
1431
- chunk_size=chunk_size,
1432
- D=D,
1433
- z=z,
1434
- dt_bias=dt_bias,
1435
- initial_states=initial_states,
1436
- seq_idx=seq_idx,
1437
- dt_softplus=True,
1438
- dt_limit=dt_limit,
1439
- )
1440
- )
1441
- out = rearrange(out, "b s h p -> b s (h p)")
1442
- rstd = None
1443
- if d_nonssm > 0:
1444
- out = torch.cat([_swiglu_fwd(zx0), out], dim=-1)
1445
- else:
1446
- out_x, _, dt_out, dA_cumsum, states, final_states = (
1447
- _mamba_chunk_scan_combined_fwd(
1448
- x,
1449
- dt,
1450
- A,
1451
- B,
1452
- C,
1453
- chunk_size=chunk_size,
1454
- D=D,
1455
- z=None,
1456
- dt_bias=dt_bias,
1457
- initial_states=initial_states,
1458
- seq_idx=seq_idx,
1459
- dt_softplus=True,
1460
- dt_limit=dt_limit,
1461
- )
1462
- )
1463
- # reshape input data into 2D tensor
1464
- x_rms = rearrange(out_x, "b s h p -> (b s) (h p)")
1465
- z_rms = rearrange(z, "b s h p -> (b s) (h p)")
1466
- rmsnorm_weight = rmsnorm_weight.contiguous()
1467
- if d_nonssm == 0:
1468
- out = None
1469
- else:
1470
- out01 = torch.empty(
1471
- (batch, seqlen, d_nonssm + dim),
1472
- dtype=x_rms.dtype,
1473
- device=x_rms.device,
1474
- )
1475
- out = rearrange(out01[..., d_nonssm:], "b s d -> (b s) d")
1476
- _swiglu_fwd(zx0, out=out01[..., :d_nonssm])
1477
- out, _, rstd = _layer_norm_fwd(
1478
- x_rms,
1479
- rmsnorm_weight,
1480
- None,
1481
- rmsnorm_eps,
1482
- z_rms,
1483
- out=out,
1484
- group_size=dim // ngroups,
1485
- norm_before_gate=norm_before_gate,
1486
- is_rms_norm=True,
1487
- )
1488
- if d_nonssm == 0:
1489
- out = rearrange(out, "(b s) d -> b s d", b=batch)
1490
- else:
1491
- out = out01
1492
- ctx.outproj_weight_dtype = (
1493
- outproj_weight.dtype if outproj_weight is not None else None
1494
- )
1495
- if outproj_weight is not None:
1496
- if torch.is_autocast_enabled():
1497
- dtype = torch.get_autocast_gpu_dtype()
1498
- out, outproj_weight = out.to(dtype), outproj_weight.to(dtype)
1499
- outproj_bias = (
1500
- outproj_bias.to(dtype) if outproj_bias is not None else None
1501
- )
1502
- out = F.linear(out, outproj_weight, outproj_bias)
1503
- else:
1504
- assert outproj_bias is None
1505
- ctx.save_for_backward(
1506
- zxbcdt,
1507
- conv1d_weight,
1508
- conv1d_bias,
1509
- out_x,
1510
- A,
1511
- D,
1512
- dt_bias,
1513
- initial_states,
1514
- seq_idx,
1515
- rmsnorm_weight,
1516
- rstd,
1517
- outproj_weight,
1518
- outproj_bias,
1519
- )
1520
- ctx.dt_limit = dt_limit
1521
- ctx.return_final_states = return_final_states
1522
- ctx.activation = activation
1523
- ctx.rmsnorm_eps = rmsnorm_eps
1524
- ctx.norm_before_gate = norm_before_gate
1525
- ctx.chunk_size = chunk_size
1526
- ctx.headdim = headdim
1527
- ctx.ngroups = ngroups
1528
- return out if not return_final_states else (out, final_states)
1529
-
1530
- @staticmethod
1531
- @custom_bwd
1532
- def backward(ctx, dout, *args):
1533
- (
1534
- zxbcdt,
1535
- conv1d_weight,
1536
- conv1d_bias,
1537
- out,
1538
- A,
1539
- D,
1540
- dt_bias,
1541
- initial_states,
1542
- seq_idx,
1543
- rmsnorm_weight,
1544
- rstd,
1545
- outproj_weight,
1546
- outproj_bias,
1547
- ) = ctx.saved_tensors
1548
- dfinal_states = args[0] if ctx.return_final_states else None
1549
- headdim = ctx.headdim
1550
- nheads = D.shape[0]
1551
- dim = nheads * headdim
1552
- assert nheads % ctx.ngroups == 0
1553
- dstate = (conv1d_weight.shape[0] - dim) // ctx.ngroups // 2
1554
- d_nonssm = (zxbcdt.shape[-1] - 2 * dim - 2 * ctx.ngroups * dstate - nheads) // 2
1555
- assert d_nonssm >= 0
1556
- recompute_output = outproj_weight is not None
1557
- if recompute_output:
1558
- out_recompute = torch.empty(
1559
- *out.shape[:2], d_nonssm + dim, device=out.device, dtype=out.dtype
1560
- )
1561
- out0_recompute, out1_recompute = out_recompute.split(
1562
- [d_nonssm, dim], dim=-1
1563
- )
1564
- zx0, z, xBC, dt = torch.split(
1565
- zxbcdt, [2 * d_nonssm, dim, dim + 2 * ctx.ngroups * dstate, nheads], dim=-1
1566
- )
1567
- # Recompute x, B, C
1568
- xBC_conv = rearrange(
1569
- causal_conv1d_cuda.causal_conv1d_fwd(
1570
- rearrange(xBC, "b s d -> b d s"),
1571
- conv1d_weight,
1572
- conv1d_bias,
1573
- seq_idx,
1574
- None,
1575
- None,
1576
- ctx.activation in ["silu", "swish"],
1577
- ),
1578
- "b d s -> b s d",
1579
- )
1580
- x, B, C = torch.split(
1581
- xBC_conv, [dim, ctx.ngroups * dstate, ctx.ngroups * dstate], dim=-1
1582
- )
1583
- x = rearrange(x, "b l (h p) -> b l h p", h=nheads)
1584
- B = rearrange(B, "b l (g n) -> b l g n", g=ctx.ngroups)
1585
- C = rearrange(C, "b l (g n) -> b l g n", g=ctx.ngroups)
1586
- dzxbcdt = torch.empty_like(zxbcdt)
1587
- dzx0, dz, dxBC_given, ddt_given = torch.split(
1588
- dzxbcdt, [2 * d_nonssm, dim, dim + 2 * ctx.ngroups * dstate, nheads], dim=-1
1589
- )
1590
- dxBC = torch.empty_like(xBC)
1591
- dx, dB, dC = torch.split(
1592
- dxBC, [dim, ctx.ngroups * dstate, ctx.ngroups * dstate], dim=-1
1593
- )
1594
- z = rearrange(z, "b l (h p) -> b l h p", h=nheads)
1595
- dx = rearrange(dx, "b l (h p) -> b l h p", h=nheads)
1596
- dB = rearrange(dB, "b l (g n) -> b l g n", g=ctx.ngroups)
1597
- dC = rearrange(dC, "b l (g n) -> b l g n", g=ctx.ngroups)
1598
- if outproj_weight is not None:
1599
- dout_og = dout
1600
- dout = F.linear(dout, outproj_weight.t())
1601
- if d_nonssm > 0:
1602
- dout0, dout = dout.split([d_nonssm, dim], dim=-1)
1603
- _swiglu_bwd(zx0, dout0, dxy=dzx0, recompute_output=True, out=out0_recompute)
1604
- dout = rearrange(dout, "b s (h p) -> b s h p", p=headdim)
1605
- if rmsnorm_weight is None:
1606
- dz = rearrange(dz, "b l (h p) -> b l h p", h=nheads)
1607
- dx, ddt, dA, dB, dC, dD, dz, ddt_bias, dinitial_states, *rest = (
1608
- _mamba_chunk_scan_combined_bwd(
1609
- dout,
1610
- x,
1611
- dt,
1612
- A,
1613
- B,
1614
- C,
1615
- out,
1616
- ctx.chunk_size,
1617
- D=D,
1618
- z=z,
1619
- dt_bias=dt_bias,
1620
- initial_states=initial_states,
1621
- dfinal_states=dfinal_states,
1622
- seq_idx=seq_idx,
1623
- dt_softplus=True,
1624
- dt_limit=ctx.dt_limit,
1625
- dx=dx,
1626
- ddt=ddt_given,
1627
- dB=dB,
1628
- dC=dC,
1629
- dz=dz,
1630
- recompute_output=recompute_output,
1631
- )
1632
- )
1633
- out_for_linear = (
1634
- rearrange(rest[0], "b s h p -> b s (h p)") if recompute_output else None
1635
- )
1636
- drmsnorm_weight = None
1637
- else:
1638
- batch = dout.shape[0]
1639
- dy_rms = rearrange(dout, "b s h p -> (b s) (h p)")
1640
- dz = rearrange(dz, "b l d -> (b l) d")
1641
- x_rms = rearrange(out, "b s h p -> (b s) (h p)")
1642
- z_rms = rearrange(z, "b s h p -> (b s) (h p)")
1643
- out1_recompute = (
1644
- rearrange(out1_recompute, "b s d -> (b s) d")
1645
- if recompute_output
1646
- else None
1647
- )
1648
- dout, drmsnorm_weight, _, dz, *rest = _layer_norm_bwd(
1649
- dy_rms,
1650
- x_rms,
1651
- rmsnorm_weight,
1652
- None,
1653
- ctx.rmsnorm_eps,
1654
- None,
1655
- rstd,
1656
- z_rms,
1657
- group_size=dim // ctx.ngroups,
1658
- norm_before_gate=ctx.norm_before_gate,
1659
- is_rms_norm=True,
1660
- recompute_output=recompute_output,
1661
- dz=dz,
1662
- out=out1_recompute if recompute_output else None,
1663
- )
1664
- out_for_linear = out_recompute if recompute_output else None
1665
- dout = rearrange(dout, "(b s) (h p) -> b s h p", b=batch, p=headdim)
1666
- dx, ddt, dA, dB, dC, dD, _, ddt_bias, dinitial_states = (
1667
- _mamba_chunk_scan_combined_bwd(
1668
- dout,
1669
- x,
1670
- dt,
1671
- A,
1672
- B,
1673
- C,
1674
- out,
1675
- ctx.chunk_size,
1676
- D=D,
1677
- z=None,
1678
- dt_bias=dt_bias,
1679
- initial_states=initial_states,
1680
- dfinal_states=dfinal_states,
1681
- seq_idx=seq_idx,
1682
- dt_softplus=True,
1683
- dt_limit=ctx.dt_limit,
1684
- dx=dx,
1685
- ddt=ddt_given,
1686
- dB=dB,
1687
- dC=dC,
1688
- )
1689
- )
1690
-
1691
- if outproj_weight is not None:
1692
- doutproj_weight = torch.einsum("bso,bsd->od", dout_og, out_for_linear)
1693
- doutproj_bias = (
1694
-                 dout_og.sum(dim=(0, 1)) if outproj_bias is not None else None
-             )
-         else:
-             doutproj_weight, doutproj_bias = None, None
-         dxBC_given = rearrange(dxBC_given, "b s d -> b d s")
-         dxBC_given, dweight, dbias, *_ = causal_conv1d_cuda.causal_conv1d_bwd(
-             rearrange(xBC, "b s d -> b d s"),
-             conv1d_weight,
-             conv1d_bias,
-             rearrange(dxBC, "b s d -> b d s"),
-             seq_idx,
-             None,
-             None,
-             dxBC_given,
-             False,
-             ctx.activation in ["silu", "swish"],
-         )
-         dxBC_given = rearrange(dxBC_given, "b d s -> b s d")
-         return (
-             dzxbcdt,
-             dweight,
-             dbias,
-             ddt_bias,
-             dA,
-             dD,
-             None,
-             dinitial_states,
-             None,
-             None,
-             None,
-             None,
-             drmsnorm_weight,
-             None,
-             doutproj_weight,
-             doutproj_bias,
-             None,
-             None,
-             None,
-         )
-
-
- def mamba_split_conv1d_scan_combined(
-     zxbcdt,
-     conv1d_weight,
-     conv1d_bias,
-     dt_bias,
-     A,
-     D,
-     chunk_size,
-     initial_states=None,
-     seq_idx=None,
-     dt_limit=(0.0, float("inf")),
-     return_final_states=False,
-     activation="silu",
-     rmsnorm_weight=None,
-     rmsnorm_eps=1e-6,
-     outproj_weight=None,
-     outproj_bias=None,
-     headdim=None,
-     ngroups=1,
-     norm_before_gate=True,
- ):
-     """
-     Argument:
-         zxbcdt: (batch, seqlen, 2 * dim + 2 * ngroups * dstate + nheads) where dim == nheads * headdim
-         conv1d_weight: (dim + 2 * ngroups * dstate, width)
-         conv1d_bias: (dim + 2 * ngroups * dstate,)
-         dt_bias: (nheads,)
-         A: (nheads)
-         D: (nheads, headdim) or (nheads,)
-         initial_states: (batch, nheads, headdim, dstate)
-         seq_idx: (batch, seqlen), int32
-         rmsnorm_weight: (dim,)
-         outproj_weight: (out_dim, dim)
-         outproj_bias: (out_dim,)
-         headdim: if D is 1D, headdim must be passed in
-         norm_before_gate: if True, we do RMSNorm(x) * F.silu(z). If False, we do RMSNorm(x * F.silu(z))
-     Return:
-         out: (batch, seqlen, dim)
-     """
-     return MambaSplitConv1dScanCombinedFn.apply(
-         zxbcdt,
-         conv1d_weight,
-         conv1d_bias,
-         dt_bias,
-         A,
-         D,
-         chunk_size,
-         initial_states,
-         seq_idx,
-         dt_limit,
-         return_final_states,
-         activation,
-         rmsnorm_weight,
-         rmsnorm_eps,
-         outproj_weight,
-         outproj_bias,
-         headdim,
-         ngroups,
-         norm_before_gate,
-     )
-
-
- def mamba_split_conv1d_scan_ref(
-     zxbcdt,
-     conv1d_weight,
-     conv1d_bias,
-     dt_bias,
-     A,
-     D,
-     chunk_size,
-     dt_limit=(0.0, float("inf")),
-     activation="silu",
-     rmsnorm_weight=None,
-     rmsnorm_eps=1e-6,
-     outproj_weight=None,
-     outproj_bias=None,
-     headdim=None,
-     ngroups=1,
-     norm_before_gate=True,
- ):
-     """
-     Argument:
-         zxbcdt: (batch, seqlen, 2 * dim + 2 * ngroups * dstate + nheads) where dim == nheads * headdim
-         conv1d_weight: (dim + 2 * ngroups * dstate, width)
-         conv1d_bias: (dim + 2 * ngroups * dstate,)
-         dt_bias: (nheads,)
-         A: (nheads)
-         D: (nheads, headdim) or (nheads,)
-         rmsnorm_weight: (dim,)
-         outproj_weight: (out_dim, dim)
-         outproj_bias: (out_dim,)
-         headdim: if D is 1D, headdim must be passed in
-         norm_before_gate: if True, we do RMSNorm(x) * F.silu(z). If False, we do RMSNorm(x * F.silu(z))
-     Return:
-         out: (batch, seqlen, dim)
-     """
-     if D.dim() == 1:
-         assert headdim is not None
-         (nheads,) = D.shape
-     else:
-         nheads, headdim = D.shape
-     assert nheads % ngroups == 0
-     batch, seqlen, _ = zxbcdt.shape
-     dim = nheads * headdim
-     dstate = (zxbcdt.shape[-1] - 2 * dim - nheads) // ngroups // 2
-     assert zxbcdt.shape == (batch, seqlen, 2 * dim + 2 * ngroups * dstate + nheads)
-     assert dt_bias.shape == (nheads,)
-     assert A.shape == (nheads,)
-     if rmsnorm_weight is not None:
-         assert rmsnorm_weight.shape == (dim,)
-     z, xBC, dt = torch.split(zxbcdt, [dim, dim + 2 * ngroups * dstate, nheads], dim=-1)
-     xBC = rearrange(
-         causal_conv1d_fn(
-             rearrange(xBC, "b s d -> b d s"),
-             conv1d_weight,
-             conv1d_bias,
-             activation=activation,
-         ),
-         "b d s -> b s d",
-     )
-     x, B, C = torch.split(xBC, [dim, ngroups * dstate, ngroups * dstate], dim=-1)
-     x = rearrange(x, "b l (h p) -> b l h p", h=nheads)
-     B = rearrange(B, "b l (g n) -> b l g n", g=ngroups)
-     C = rearrange(C, "b l (g n) -> b l g n", g=ngroups)
-     z = rearrange(z, "b l (h p) -> b l h p", h=nheads)
-     out = ssd_selective_scan(
-         x,
-         dt.to(x.dtype),
-         A,
-         B,
-         C,
-         D=D.float(),
-         z=z if rmsnorm_weight is None else None,
-         dt_bias=dt_bias,
-         dt_softplus=True,
-         dt_limit=dt_limit,
-     )
-     out = rearrange(out, "b s h p -> b s (h p)")
-     if rmsnorm_weight is not None:
-         out = rmsnorm_fn(
-             out,
-             rmsnorm_weight,
-             None,
-             z=rearrange(z, "b l h p -> b l (h p)"),
-             eps=rmsnorm_eps,
-             norm_before_gate=norm_before_gate,
-         )
-     if outproj_weight is not None:
-         out = F.linear(out, outproj_weight, outproj_bias)
-     return out
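
Note on the packed zxbcdt layout documented above, which is the least obvious part of this API. The following minimal sketch (sizes are illustrative and not taken from this repository) mirrors the torch.split calls that mamba_split_conv1d_scan_ref performs internally, and is a quick way to check that an input tensor is laid out correctly before handing it to the fused kernel:

    import torch

    # Illustrative sizes, chosen only to make the arithmetic concrete.
    batch, seqlen, nheads, headdim, ngroups, dstate = 2, 128, 4, 64, 1, 16
    dim = nheads * headdim
    d_in_proj = 2 * dim + 2 * ngroups * dstate + nheads  # z + xBC + dt

    zxbcdt = torch.randn(batch, seqlen, d_in_proj)
    # Same splits as mamba_split_conv1d_scan_ref:
    z, xBC, dt = torch.split(zxbcdt, [dim, dim + 2 * ngroups * dstate, nheads], dim=-1)
    x, B, C = torch.split(xBC, [dim, ngroups * dstate, ngroups * dstate], dim=-1)
    print(z.shape, x.shape, B.shape, C.shape, dt.shape)
    # (2, 128, 256) (2, 128, 256) (2, 128, 16) (2, 128, 16) (2, 128, 4)
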
build/torch25-cxx11-cu124-x86_64-linux/mamba_ssm/__init__.py DELETED
@@ -1,14 +0,0 @@
- __version__ = "2.2.4"
-
- from .ops.selective_scan_interface import selective_scan_fn, mamba_inner_fn
- from .modules.mamba_simple import Mamba
- from .modules.mamba2 import Mamba2
- from .models.mixer_seq_simple import MambaLMHeadModel
-
- __all__ = [
-     "selective_scan_fn",
-     "mamba_inner_fn",
-     "Mamba",
-     "Mamba2",
-     "MambaLMHeadModel",
- ]
build/torch25-cxx11-cu124-x86_64-linux/mamba_ssm/distributed/__init__.py DELETED
File without changes
build/torch25-cxx11-cu124-x86_64-linux/mamba_ssm/distributed/tensor_parallel.py DELETED
@@ -1,326 +0,0 @@
1
- # Copyright (c) 2024, Tri Dao.
2
- # The TensorParallel linear modules are inspired by https://github.com/NVIDIA/apex/blob/master/apex/transformer/tensor_parallel/layers.py
3
- from typing import Optional
4
-
5
- import torch
6
- import torch.nn as nn
7
- import torch.nn.functional as F
8
- from torch import Tensor
9
- from torch.distributed import ProcessGroup
10
- from ..utils.torch import custom_bwd, custom_fwd
11
-
12
- from einops import rearrange
13
-
14
- from ..distributed.distributed_utils import (
15
- all_gather_raw,
16
- all_reduce,
17
- all_reduce_raw,
18
- reduce_scatter,
19
- reduce_scatter_raw,
20
- )
21
-
22
-
23
- class ParallelLinearFunc(torch.autograd.Function):
24
- @staticmethod
25
- @custom_fwd
26
- def forward(ctx, x, weight, bias, process_group=None, sequence_parallel=True):
27
- """
28
- If process_group is not None and sequence_parallel=True, we're doing Tensor Parallel
29
- with sequence parallelism: we do an all_gather_raw of x before doing the matmul.
30
- """
31
- ctx.compute_weight_gradient = weight.requires_grad
32
- ctx.process_group = process_group
33
- ctx.sequence_parallel = sequence_parallel
34
-
35
- if torch.is_autocast_enabled():
36
- x = x.to(dtype=torch.get_autocast_gpu_dtype())
37
- x = x.contiguous()
38
- if process_group is not None and sequence_parallel:
39
- # We want to kick off the all_gather early, before weight dtype conversion
40
- total_x, handle_x = all_gather_raw(x, process_group, async_op=True)
41
- else:
42
- total_x = x
43
-
44
- if torch.is_autocast_enabled():
45
- weight = weight.to(dtype=torch.get_autocast_gpu_dtype())
46
- bias = (
47
- bias.to(dtype=torch.get_autocast_gpu_dtype())
48
- if bias is not None
49
- else None
50
- )
51
- weight = weight.contiguous()
52
- if process_group is not None and sequence_parallel:
53
- handle_x.wait()
54
- batch_shape, n = total_x.shape[:-1], total_x.shape[-1]
55
- batch_dim = batch_shape.numel()
56
- # https://github.com/pytorch/pytorch/blob/5b51849b48a7dbccd297286cc0110def4706f9e7/aten/src/ATen/native/cuda/Blas.cpp#L174
57
- output = F.linear(total_x, weight, bias)
58
- if ctx.compute_weight_gradient:
59
- ctx.save_for_backward(x, weight)
60
- else:
61
- ctx.save_for_backward(weight)
62
- return output
63
-
64
- @staticmethod
65
- @custom_bwd
66
- def backward(ctx, grad_output):
67
- grad_output = grad_output.contiguous()
68
- process_group = ctx.process_group
69
- sequence_parallel = ctx.sequence_parallel
70
- if ctx.compute_weight_gradient:
71
- x, weight = ctx.saved_tensors
72
- if process_group is not None and sequence_parallel:
73
- total_x, handle_x = all_gather_raw(x, process_group, async_op=True)
74
- else:
75
- total_x = x
76
- else:
77
- (weight,) = ctx.saved_tensors
78
- total_x = None
79
- batch_shape = grad_output.shape[:-1]
80
- batch_dim = batch_shape.numel()
81
- grad_output = grad_output.reshape(batch_dim, grad_output.shape[-1])
82
- if ctx.needs_input_grad[0]:
83
- grad_input = F.linear(grad_output, weight.t())
84
- grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1])
85
- if process_group is not None:
86
- reduce_fn = reduce_scatter_raw if sequence_parallel else all_reduce_raw
87
- grad_input, handle_grad_input = reduce_fn(
88
- grad_input, process_group, async_op=True
89
- )
90
- else:
91
- grad_input = None
92
- if ctx.needs_input_grad[1]:
93
- assert ctx.compute_weight_gradient
94
- if process_group is not None and sequence_parallel:
95
- handle_x.wait()
96
- grad_weight = torch.einsum(
97
- "bo,bi->oi", grad_output, total_x.reshape(batch_dim, total_x.shape[-1])
98
- )
99
- else:
100
- grad_weight = None
101
- grad_bias = grad_output.sum(dim=0) if ctx.needs_input_grad[2] else None
102
- if process_group is not None and ctx.needs_input_grad[0]:
103
- handle_grad_input.wait()
104
- return grad_input, grad_weight, grad_bias, None, None
105
-
106
-
107
- def parallel_linear_func(
108
- x: Tensor,
109
- weight: Tensor,
110
- bias: Optional[Tensor] = None,
111
- process_group: Optional[ProcessGroup] = None,
112
- sequence_parallel: bool = True,
113
- ):
114
- return ParallelLinearFunc.apply(x, weight, bias, process_group, sequence_parallel)
115
-
116
-
117
- class ColumnParallelLinear(nn.Linear):
118
- def __init__(
119
- self,
120
- in_features: int,
121
- out_features: int,
122
- process_group: ProcessGroup,
123
- bias: bool = True,
124
- sequence_parallel=True,
125
- multiple_of=1,
126
- device=None,
127
- dtype=None,
128
- ) -> None:
129
- world_size = torch.distributed.get_world_size(process_group)
130
- if out_features % multiple_of:
131
- raise ValueError(
132
- f"out_features ({out_features}) must be a multiple of {multiple_of}"
133
- )
134
- multiple = out_features // multiple_of
135
- # We want to split @multiple across world_size, but it could be an uneven split
136
- div = multiple // world_size
137
- mod = multiple % world_size
138
- # The first @mod ranks get @div + 1 copies, the rest get @div copies
139
- local_multiple = div + int(torch.distributed.get_rank(process_group) < mod)
140
- super().__init__(
141
- in_features,
142
- local_multiple * multiple_of,
143
- bias=bias,
144
- device=device,
145
- dtype=dtype,
146
- )
147
- self.process_group = process_group
148
- self.sequence_parallel = sequence_parallel
149
-
150
- def forward(self, x):
151
- # If self.sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism:
152
- # we do an all_gather of x before doing the matmul.
153
- # If not, then the input is already gathered.
154
- return parallel_linear_func(
155
- x,
156
- self.weight,
157
- self.bias,
158
- process_group=self.process_group,
159
- sequence_parallel=self.sequence_parallel,
160
- )
161
-
162
-
163
- class RowParallelLinear(nn.Linear):
164
- def __init__(
165
- self,
166
- in_features: int,
167
- out_features: int,
168
- process_group: ProcessGroup,
169
- bias: bool = True,
170
- sequence_parallel=True,
171
- multiple_of=1,
172
- device=None,
173
- dtype=None,
174
- ) -> None:
175
- world_size = torch.distributed.get_world_size(process_group)
176
- rank = torch.distributed.get_rank(process_group)
177
- if in_features % multiple_of:
178
- raise ValueError(
179
- f"in_features ({in_features}) must be a multiple of {multiple_of}"
180
- )
181
- multiple = in_features // multiple_of
182
- # We want to split @multiple across world_size, but it could be an uneven split
183
- div = multiple // world_size
184
- mod = multiple % world_size
185
- # The first @mod ranks get @div + 1 copies, the rest get @div copies
186
- local_multiple = div + int(torch.distributed.get_rank(process_group) < mod)
187
- # Only rank 0 will have bias
188
- super().__init__(
189
- local_multiple * multiple_of,
190
- out_features,
191
- bias=bias and rank == 0,
192
- device=device,
193
- dtype=dtype,
194
- )
195
- self.process_group = process_group
196
- self.sequence_parallel = sequence_parallel
197
-
198
- def forward(self, x):
199
- """
200
- We're doing Tensor Parallel with sequence parallelism: we do the matmul and then
201
- a reduce_scatter of the result.
202
- """
203
- out = parallel_linear_func(x, self.weight, self.bias)
204
- reduce_fn = reduce_scatter if self.sequence_parallel else all_reduce
205
- return reduce_fn(out, self.process_group)
206
-
207
-
208
- class VocabParallelEmbedding(nn.Embedding):
209
- def __init__(
210
- self, num_embeddings, *args, process_group=None, padding_idx=None, **kwargs
211
- ):
212
- self.process_group = process_group
213
- if process_group is not None:
214
- world_size = torch.distributed.get_world_size(process_group)
215
- if num_embeddings % world_size != 0:
216
- raise ValueError(
217
- f"num_embeddings ({num_embeddings}) must be divisible by "
218
- f"world_size ({world_size})"
219
- )
220
- if world_size > 1 and padding_idx is not None:
221
- raise RuntimeError("ParallelEmbedding does not support padding_idx")
222
- else:
223
- world_size = 1
224
- super().__init__(
225
- num_embeddings // world_size, *args, padding_idx=padding_idx, **kwargs
226
- )
227
-
228
- def forward(self, input: Tensor) -> Tensor:
229
- if self.process_group is None:
230
- return super().forward(input)
231
- else:
232
- rank = torch.distributed.get_rank(self.process_group)
233
- vocab_size = self.num_embeddings
234
- vocab_start_index, vocab_end_index = (
235
- rank * vocab_size,
236
- (rank + 1) * vocab_size,
237
- )
238
- # Create a mask of valid vocab ids (1 means it needs to be masked).
239
- input_ids_mask = (input < vocab_start_index) | (input >= vocab_end_index)
240
- input = input - vocab_start_index
241
- input[input_ids_mask] = 0
242
- embeddings = super().forward(input)
243
- embeddings[input_ids_mask] = 0.0
244
- return embeddings
245
-
246
-
247
- class ColumnParallelEmbedding(nn.Embedding):
248
- def __init__(
249
- self, num_embeddings, embedding_dim, *args, process_group=None, **kwargs
250
- ):
251
- self.process_group = process_group
252
- if process_group is not None:
253
- world_size = torch.distributed.get_world_size(process_group)
254
- if embedding_dim % world_size != 0:
255
- raise ValueError(
256
- f"embedding_dim ({embedding_dim}) must be divisible by "
257
- f"world_size ({world_size})"
258
- )
259
- else:
260
- world_size = 1
261
- super().__init__(num_embeddings, embedding_dim // world_size, *args, **kwargs)
262
-
263
-
264
- class ParallelEmbeddings(nn.Module):
265
- def __init__(
266
- self,
267
- embed_dim,
268
- vocab_size,
269
- max_position_embeddings,
270
- process_group,
271
- padding_idx=None,
272
- sequence_parallel=True,
273
- device=None,
274
- dtype=None,
275
- ):
276
- """
277
- If max_position_embeddings <= 0, there's no position embeddings
278
- """
279
- factory_kwargs = {"device": device, "dtype": dtype}
280
- super().__init__()
281
- self.process_group = process_group
282
- self.sequence_parallel = sequence_parallel
283
- self.word_embeddings = VocabParallelEmbedding(
284
- vocab_size,
285
- embed_dim,
286
- padding_idx=padding_idx,
287
- process_group=process_group,
288
- **factory_kwargs,
289
- )
290
- self.max_position_embeddings = max_position_embeddings
291
- if self.max_position_embeddings > 0:
292
- self.position_embeddings = ColumnParallelEmbedding(
293
- max_position_embeddings,
294
- embed_dim,
295
- process_group=process_group,
296
- **factory_kwargs,
297
- )
298
-
299
- def forward(self, input_ids, position_ids=None, combine_batch_seqlen_dim=False):
300
- """
301
- input_ids: (batch, seqlen)
302
- position_ids: (batch, seqlen)
303
- """
304
- batch_size, seqlen = input_ids.shape
305
- world_size = torch.distributed.get_world_size(self.process_group)
306
- embeddings = self.word_embeddings(input_ids)
307
- if self.max_position_embeddings > 0:
308
- if position_ids is None:
309
- position_ids = torch.arange(
310
- seqlen, dtype=torch.long, device=input_ids.device
311
- )
312
- position_embeddings = self.position_embeddings(position_ids)
313
- if world_size <= 1:
314
- embeddings = embeddings + position_embeddings
315
- else:
316
- partition_dim = self.position_embeddings.embedding_dim
317
- rank = torch.distributed.get_rank(self.process_group)
318
- embeddings[
319
- ..., rank * partition_dim : (rank + 1) * partition_dim
320
- ] += position_embeddings
321
- if combine_batch_seqlen_dim:
322
- embeddings = rearrange(embeddings, "b s d -> (b s) d")
323
- reduce_fn = reduce_scatter if self.sequence_parallel else all_reduce
324
- return (
325
- embeddings if world_size <= 1 else reduce_fn(embeddings, self.process_group)
326
- )
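
The sharding rule used by ColumnParallelLinear and RowParallelLinear above (divide out_features // multiple_of units of size multiple_of across world_size ranks, giving the first mod ranks one extra unit) is easy to verify in isolation. A minimal standalone sketch of the same arithmetic, with illustrative sizes:

    def local_out_features(out_features: int, multiple_of: int, world_size: int, rank: int) -> int:
        # Mirrors ColumnParallelLinear.__init__: the first `mod` ranks get div + 1
        # units of size multiple_of, the remaining ranks get div units.
        if out_features % multiple_of:
            raise ValueError(f"out_features ({out_features}) must be a multiple of {multiple_of}")
        multiple = out_features // multiple_of
        div, mod = divmod(multiple, world_size)
        return (div + int(rank < mod)) * multiple_of

    # 640 features in units of 64 over 4 ranks -> per-rank widths 192, 192, 128, 128.
    print([local_out_features(640, 64, 4, r) for r in range(4)])
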
build/torch25-cxx11-cu124-x86_64-linux/mamba_ssm/models/__init__.py DELETED
File without changes
build/torch25-cxx11-cu124-x86_64-linux/mamba_ssm/models/mixer_seq_simple.py DELETED
@@ -1,338 +0,0 @@
1
- # Copyright (c) 2023, Albert Gu, Tri Dao.
2
-
3
- import math
4
- from functools import partial
5
- import json
6
- import os
7
- import copy
8
-
9
- from collections import namedtuple
10
-
11
- import torch
12
- import torch.nn as nn
13
-
14
- from .config_mamba import MambaConfig
15
- from ..modules.mamba_simple import Mamba
16
- from ..modules.mamba2 import Mamba2
17
- from ..modules.mha import MHA
18
- from ..modules.mlp import GatedMLP
19
- from ..modules.block import Block
20
- from ..utils.generation import GenerationMixin
21
- from ..utils.hf import load_config_hf, load_state_dict_hf
22
-
23
- try:
24
- from ..ops.triton.layer_norm import RMSNorm, layer_norm_fn, rms_norm_fn
25
- except ImportError:
26
- RMSNorm, layer_norm_fn, rms_norm_fn = None, None, None
27
-
28
-
29
- def create_block(
30
- d_model,
31
- d_intermediate,
32
- ssm_cfg=None,
33
- attn_layer_idx=None,
34
- attn_cfg=None,
35
- norm_epsilon=1e-5,
36
- rms_norm=False,
37
- residual_in_fp32=False,
38
- fused_add_norm=False,
39
- layer_idx=None,
40
- device=None,
41
- dtype=None,
42
- ):
43
- if ssm_cfg is None:
44
- ssm_cfg = {}
45
- if attn_layer_idx is None:
46
- attn_layer_idx = []
47
- if attn_cfg is None:
48
- attn_cfg = {}
49
- factory_kwargs = {"device": device, "dtype": dtype}
50
- if layer_idx not in attn_layer_idx:
51
- # Create a copy of the config to modify
52
- ssm_cfg = copy.deepcopy(ssm_cfg) if ssm_cfg is not None else {}
53
- ssm_layer = ssm_cfg.pop("layer", "Mamba1")
54
- if ssm_layer not in ["Mamba1", "Mamba2"]:
55
- raise ValueError(
56
- f"Invalid ssm_layer: {ssm_layer}, only support Mamba1 and Mamba2"
57
- )
58
- mixer_cls = partial(
59
- Mamba2 if ssm_layer == "Mamba2" else Mamba,
60
- layer_idx=layer_idx,
61
- **ssm_cfg,
62
- **factory_kwargs,
63
- )
64
- else:
65
- mixer_cls = partial(MHA, layer_idx=layer_idx, **attn_cfg, **factory_kwargs)
66
- norm_cls = partial(
67
- nn.LayerNorm if not rms_norm else RMSNorm, eps=norm_epsilon, **factory_kwargs
68
- )
69
- if d_intermediate == 0:
70
- mlp_cls = nn.Identity
71
- else:
72
- mlp_cls = partial(
73
- GatedMLP,
74
- hidden_features=d_intermediate,
75
- out_features=d_model,
76
- **factory_kwargs,
77
- )
78
- block = Block(
79
- d_model,
80
- mixer_cls,
81
- mlp_cls,
82
- norm_cls=norm_cls,
83
- fused_add_norm=fused_add_norm,
84
- residual_in_fp32=residual_in_fp32,
85
- )
86
- block.layer_idx = layer_idx
87
- return block
88
-
89
-
90
- # https://github.com/huggingface/transformers/blob/c28d04e9e252a1a099944e325685f14d242ecdcd/src/transformers/models/gpt2/modeling_gpt2.py#L454
91
- def _init_weights(
92
- module,
93
- n_layer,
94
- initializer_range=0.02, # Now only used for embedding layer.
95
- rescale_prenorm_residual=True,
96
- n_residuals_per_layer=1, # Change to 2 if we have MLP
97
- ):
98
- if isinstance(module, nn.Linear):
99
- if module.bias is not None:
100
- if not getattr(module.bias, "_no_reinit", False):
101
- nn.init.zeros_(module.bias)
102
- elif isinstance(module, nn.Embedding):
103
- nn.init.normal_(module.weight, std=initializer_range)
104
-
105
- if rescale_prenorm_residual:
106
- # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
107
- # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
108
- # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
109
- # > -- GPT-2 :: https://openai.com/blog/better-language-models/
110
- #
111
- # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
112
- for name, p in module.named_parameters():
113
- if name in ["out_proj.weight", "fc2.weight"]:
114
- # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
115
- # Following Pytorch init, except scale by 1/sqrt(2 * n_layer)
116
- # We need to reinit p since this code could be called multiple times
117
- # Having just p *= scale would repeatedly scale it down
118
- nn.init.kaiming_uniform_(p, a=math.sqrt(5))
119
- with torch.no_grad():
120
- p /= math.sqrt(n_residuals_per_layer * n_layer)
121
-
122
-
123
- class MixerModel(nn.Module):
124
- def __init__(
125
- self,
126
- d_model: int,
127
- n_layer: int,
128
- d_intermediate: int,
129
- vocab_size: int,
130
- ssm_cfg=None,
131
- attn_layer_idx=None,
132
- attn_cfg=None,
133
- norm_epsilon: float = 1e-5,
134
- rms_norm: bool = False,
135
- initializer_cfg=None,
136
- fused_add_norm=False,
137
- residual_in_fp32=False,
138
- device=None,
139
- dtype=None,
140
- ) -> None:
141
- factory_kwargs = {"device": device, "dtype": dtype}
142
- super().__init__()
143
- self.residual_in_fp32 = residual_in_fp32
144
-
145
- self.embedding = nn.Embedding(vocab_size, d_model, **factory_kwargs)
146
-
147
- # We change the order of residual and layer norm:
148
- # Instead of LN -> Attn / MLP -> Add, we do:
149
- # Add -> LN -> Attn / MLP / Mixer, returning both the residual branch (output of Add) and
150
- # the main branch (output of MLP / Mixer). The model definition is unchanged.
151
- # This is for performance reason: we can fuse add + layer_norm.
152
- self.fused_add_norm = fused_add_norm
153
- if self.fused_add_norm:
154
- if layer_norm_fn is None or rms_norm_fn is None:
155
- raise ImportError("Failed to import Triton LayerNorm / RMSNorm kernels")
156
-
157
- self.layers = nn.ModuleList(
158
- [
159
- create_block(
160
- d_model,
161
- d_intermediate=d_intermediate,
162
- ssm_cfg=ssm_cfg,
163
- attn_layer_idx=attn_layer_idx,
164
- attn_cfg=attn_cfg,
165
- norm_epsilon=norm_epsilon,
166
- rms_norm=rms_norm,
167
- residual_in_fp32=residual_in_fp32,
168
- fused_add_norm=fused_add_norm,
169
- layer_idx=i,
170
- **factory_kwargs,
171
- )
172
- for i in range(n_layer)
173
- ]
174
- )
175
-
176
- self.norm_f = (nn.LayerNorm if not rms_norm else RMSNorm)(
177
- d_model, eps=norm_epsilon, **factory_kwargs
178
- )
179
-
180
- self.apply(
181
- partial(
182
- _init_weights,
183
- n_layer=n_layer,
184
- **(initializer_cfg if initializer_cfg is not None else {}),
185
- n_residuals_per_layer=(
186
- 1 if d_intermediate == 0 else 2
187
- ), # 2 if we have MLP
188
- )
189
- )
190
-
191
- def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
192
- return {
193
- i: layer.allocate_inference_cache(
194
- batch_size, max_seqlen, dtype=dtype, **kwargs
195
- )
196
- for i, layer in enumerate(self.layers)
197
- }
198
-
199
- def forward(self, input_ids, inference_params=None, **mixer_kwargs):
200
- hidden_states = self.embedding(input_ids)
201
- residual = None
202
- for layer in self.layers:
203
- hidden_states, residual = layer(
204
- hidden_states,
205
- residual,
206
- inference_params=inference_params,
207
- **mixer_kwargs,
208
- )
209
- if not self.fused_add_norm:
210
- residual = (
211
- (hidden_states + residual) if residual is not None else hidden_states
212
- )
213
- hidden_states = self.norm_f(residual.to(dtype=self.norm_f.weight.dtype))
214
- else:
215
- # Set prenorm=False here since we don't need the residual
216
- hidden_states = layer_norm_fn(
217
- hidden_states,
218
- self.norm_f.weight,
219
- self.norm_f.bias,
220
- eps=self.norm_f.eps,
221
- residual=residual,
222
- prenorm=False,
223
- residual_in_fp32=self.residual_in_fp32,
224
- is_rms_norm=isinstance(self.norm_f, RMSNorm),
225
- )
226
- return hidden_states
227
-
228
-
229
- class MambaLMHeadModel(nn.Module, GenerationMixin):
230
-
231
- def __init__(
232
- self,
233
- config: MambaConfig,
234
- initializer_cfg=None,
235
- device=None,
236
- dtype=None,
237
- ) -> None:
238
- self.config = config
239
- d_model = config.d_model
240
- n_layer = config.n_layer
241
- d_intermediate = config.d_intermediate
242
- vocab_size = config.vocab_size
243
- ssm_cfg = config.ssm_cfg
244
- attn_layer_idx = config.attn_layer_idx
245
- attn_cfg = config.attn_cfg
246
- rms_norm = config.rms_norm
247
- residual_in_fp32 = config.residual_in_fp32
248
- fused_add_norm = config.fused_add_norm
249
- pad_vocab_size_multiple = config.pad_vocab_size_multiple
250
- factory_kwargs = {"device": device, "dtype": dtype}
251
-
252
- super().__init__()
253
- if vocab_size % pad_vocab_size_multiple != 0:
254
- vocab_size += pad_vocab_size_multiple - (
255
- vocab_size % pad_vocab_size_multiple
256
- )
257
- self.backbone = MixerModel(
258
- d_model=d_model,
259
- n_layer=n_layer,
260
- d_intermediate=d_intermediate,
261
- vocab_size=vocab_size,
262
- ssm_cfg=ssm_cfg,
263
- attn_layer_idx=attn_layer_idx,
264
- attn_cfg=attn_cfg,
265
- rms_norm=rms_norm,
266
- initializer_cfg=initializer_cfg,
267
- fused_add_norm=fused_add_norm,
268
- residual_in_fp32=residual_in_fp32,
269
- **factory_kwargs,
270
- )
271
- self.lm_head = nn.Linear(d_model, vocab_size, bias=False, **factory_kwargs)
272
-
273
- # Initialize weights and apply final processing
274
- self.apply(
275
- partial(
276
- _init_weights,
277
- n_layer=n_layer,
278
- **(initializer_cfg if initializer_cfg is not None else {}),
279
- )
280
- )
281
- self.tie_weights()
282
-
283
- def tie_weights(self):
284
- if self.config.tie_embeddings:
285
- self.lm_head.weight = self.backbone.embedding.weight
286
-
287
- def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
288
- return self.backbone.allocate_inference_cache(
289
- batch_size, max_seqlen, dtype=dtype, **kwargs
290
- )
291
-
292
- def forward(
293
- self,
294
- input_ids,
295
- position_ids=None,
296
- inference_params=None,
297
- num_last_tokens=0,
298
- **mixer_kwargs,
299
- ):
300
- """
301
- "position_ids" is just to be compatible with Transformer generation. We don't use it.
302
- num_last_tokens: if > 0, only return the logits for the last n tokens
303
- """
304
- hidden_states = self.backbone(
305
- input_ids, inference_params=inference_params, **mixer_kwargs
306
- )
307
- if num_last_tokens > 0:
308
- hidden_states = hidden_states[:, -num_last_tokens:]
309
- lm_logits = self.lm_head(hidden_states)
310
- CausalLMOutput = namedtuple("CausalLMOutput", ["logits"])
311
- return CausalLMOutput(logits=lm_logits)
312
-
313
- @classmethod
314
- def from_pretrained(cls, pretrained_model_name, device=None, dtype=None, **kwargs):
315
- config_data = load_config_hf(pretrained_model_name)
316
- config = MambaConfig(**config_data)
317
- model = cls(config, device=device, dtype=dtype, **kwargs)
318
- model.load_state_dict(
319
- load_state_dict_hf(pretrained_model_name, device=device, dtype=dtype)
320
- )
321
- return model
322
-
323
- def save_pretrained(self, save_directory):
324
- """
325
- Minimal implementation of save_pretrained for MambaLMHeadModel.
326
- Save the model and its configuration file to a directory.
327
- """
328
- # Ensure save_directory exists
329
- os.makedirs(save_directory, exist_ok=True)
330
-
331
- # Save the model's state_dict
332
- model_path = os.path.join(save_directory, "pytorch_model.bin")
333
- torch.save(self.state_dict(), model_path)
334
-
335
- # Save the configuration of the model
336
- config_path = os.path.join(save_directory, "config.json")
337
- with open(config_path, "w") as f:
338
- json.dump(self.config.__dict__, f, indent=4)
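
As a usage note for the model class removed above: from_pretrained, forward, and save_pretrained are the entry points most callers need. A minimal sketch, assuming a CUDA machine with the mamba_ssm kernels installed; the checkpoint name is illustrative and not taken from this diff, and tokenization is skipped in favor of random token ids:

    import torch
    from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel

    # "state-spaces/mamba-130m" is an assumed, illustrative checkpoint name.
    model = MambaLMHeadModel.from_pretrained(
        "state-spaces/mamba-130m", device="cuda", dtype=torch.float16
    )
    input_ids = torch.randint(0, model.config.vocab_size, (1, 16), device="cuda")
    out = model(input_ids, num_last_tokens=1)    # CausalLMOutput namedtuple
    print(out.logits.shape)                      # (1, 1, vocab_size)
    model.save_pretrained("./mamba-checkpoint")  # writes pytorch_model.bin and config.json
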
build/torch25-cxx11-cu124-x86_64-linux/mamba_ssm/modules/__init__.py DELETED
File without changes
build/torch25-cxx11-cu124-x86_64-linux/mamba_ssm/ops/__init__.py DELETED
File without changes
build/torch25-cxx11-cu124-x86_64-linux/mamba_ssm/ops/selective_scan_interface.py DELETED
@@ -1,659 +0,0 @@
1
- # Copyright (c) 2023, Tri Dao, Albert Gu.
2
-
3
- import torch
4
- import torch.nn.functional as F
5
- from ..utils.torch import custom_fwd, custom_bwd
6
-
7
- from einops import rearrange, repeat
8
-
9
- try:
10
- from causal_conv1d import causal_conv1d_fn
11
- import causal_conv1d_cuda
12
- except ImportError:
13
- causal_conv1d_fn = None
14
- causal_conv1d_cuda = None
15
-
16
- from .triton.layer_norm import _layer_norm_fwd
17
-
18
- from .._ops import ops
19
-
20
-
21
- class SelectiveScanFn(torch.autograd.Function):
22
-
23
- @staticmethod
24
- def forward(
25
- ctx,
26
- u,
27
- delta,
28
- A,
29
- B,
30
- C,
31
- D=None,
32
- z=None,
33
- delta_bias=None,
34
- delta_softplus=False,
35
- return_last_state=False,
36
- ):
37
- if u.stride(-1) != 1:
38
- u = u.contiguous()
39
- if delta.stride(-1) != 1:
40
- delta = delta.contiguous()
41
- if D is not None:
42
- D = D.contiguous()
43
- if B.stride(-1) != 1:
44
- B = B.contiguous()
45
- if C.stride(-1) != 1:
46
- C = C.contiguous()
47
- if z is not None and z.stride(-1) != 1:
48
- z = z.contiguous()
49
- if B.dim() == 3:
50
- B = rearrange(B, "b dstate l -> b 1 dstate l")
51
- ctx.squeeze_B = True
52
- if C.dim() == 3:
53
- C = rearrange(C, "b dstate l -> b 1 dstate l")
54
- ctx.squeeze_C = True
55
- out, x, *rest = ops.selective_scan_fwd(
56
- u, delta, A, B, C, D, z, delta_bias, delta_softplus
57
- )
58
- ctx.delta_softplus = delta_softplus
59
- ctx.has_z = z is not None
60
- last_state = x[:, :, -1, 1::2] # (batch, dim, dstate)
61
- if not ctx.has_z:
62
- ctx.save_for_backward(u, delta, A, B, C, D, delta_bias, x)
63
- return out if not return_last_state else (out, last_state)
64
- else:
65
- ctx.save_for_backward(u, delta, A, B, C, D, z, delta_bias, x, out)
66
- out_z = rest[0]
67
- return out_z if not return_last_state else (out_z, last_state)
68
-
69
- @staticmethod
70
- def backward(ctx, dout, *args):
71
- if not ctx.has_z:
72
- u, delta, A, B, C, D, delta_bias, x = ctx.saved_tensors
73
- z = None
74
- out = None
75
- else:
76
- u, delta, A, B, C, D, z, delta_bias, x, out = ctx.saved_tensors
77
- if dout.stride(-1) != 1:
78
- dout = dout.contiguous()
79
- # The kernel supports passing in a pre-allocated dz (e.g., in case we want to fuse the
80
- # backward of selective_scan_cuda with the backward of chunk).
81
- # Here we just pass in None and dz will be allocated in the C++ code.
82
- du, ddelta, dA, dB, dC, dD, ddelta_bias, *rest = ops.selective_scan_bwd(
83
- u,
84
- delta,
85
- A,
86
- B,
87
- C,
88
- D,
89
- z,
90
- delta_bias,
91
- dout,
92
- x,
93
- out,
94
- None,
95
- ctx.delta_softplus,
96
- False, # option to recompute out_z, not used here
97
- )
98
- dz = rest[0] if ctx.has_z else None
99
- dB = dB.squeeze(1) if getattr(ctx, "squeeze_B", False) else dB
100
- dC = dC.squeeze(1) if getattr(ctx, "squeeze_C", False) else dC
101
- return (
102
- du,
103
- ddelta,
104
- dA,
105
- dB,
106
- dC,
107
- dD if D is not None else None,
108
- dz,
109
- ddelta_bias if delta_bias is not None else None,
110
- None,
111
- None,
112
- )
113
-
114
-
115
- def rms_norm_forward(
116
- x,
117
- weight,
118
- bias,
119
- eps=1e-6,
120
- is_rms_norm=True,
121
- ):
122
- # x (b l) d
123
- if x.stride(-1) != 1:
124
- x = x.contiguous()
125
- weight = weight.contiguous()
126
- if bias is not None:
127
- bias = bias.contiguous()
128
- y = _layer_norm_fwd(
129
- x, weight, bias, eps, None, residual_dtype=None, is_rms_norm=is_rms_norm
130
- )[0]
131
- # y (b l) d
132
- return y
133
-
134
-
135
- def selective_scan_fn(
136
- u,
137
- delta,
138
- A,
139
- B,
140
- C,
141
- D=None,
142
- z=None,
143
- delta_bias=None,
144
- delta_softplus=False,
145
- return_last_state=False,
146
- ):
147
- """if return_last_state is True, returns (out, last_state)
148
- last_state has shape (batch, dim, dstate). Note that the gradient of the last state is
149
- not considered in the backward pass.
150
- """
151
- return SelectiveScanFn.apply(
152
- u, delta, A, B, C, D, z, delta_bias, delta_softplus, return_last_state
153
- )
154
-
155
-
156
- def selective_scan_ref(
157
- u,
158
- delta,
159
- A,
160
- B,
161
- C,
162
- D=None,
163
- z=None,
164
- delta_bias=None,
165
- delta_softplus=False,
166
- return_last_state=False,
167
- ):
168
- """
169
- u: r(B D L)
170
- delta: r(B D L)
171
- A: c(D N) or r(D N)
172
- B: c(D N) or r(B N L) or r(B N 2L) or r(B G N L) or (B G N L)
173
- C: c(D N) or r(B N L) or r(B N 2L) or r(B G N L) or (B G N L)
174
- D: r(D)
175
- z: r(B D L)
176
- delta_bias: r(D), fp32
177
-
178
- out: r(B D L)
179
- last_state (optional): r(B D dstate) or c(B D dstate)
180
- """
181
- dtype_in = u.dtype
182
- u = u.float()
183
- delta = delta.float()
184
- if delta_bias is not None:
185
- delta = delta + delta_bias[..., None].float()
186
- if delta_softplus:
187
- delta = F.softplus(delta)
188
- batch, dim, dstate = u.shape[0], A.shape[0], A.shape[1]
189
- is_variable_B = B.dim() >= 3
190
- is_variable_C = C.dim() >= 3
191
- if A.is_complex():
192
- if is_variable_B:
193
- B = torch.view_as_complex(
194
- rearrange(B.float(), "... (L two) -> ... L two", two=2)
195
- )
196
- if is_variable_C:
197
- C = torch.view_as_complex(
198
- rearrange(C.float(), "... (L two) -> ... L two", two=2)
199
- )
200
- else:
201
- B = B.float()
202
- C = C.float()
203
- x = A.new_zeros((batch, dim, dstate))
204
- ys = []
205
- deltaA = torch.exp(torch.einsum("bdl,dn->bdln", delta, A))
206
- if not is_variable_B:
207
- deltaB_u = torch.einsum("bdl,dn,bdl->bdln", delta, B, u)
208
- else:
209
- if B.dim() == 3:
210
- deltaB_u = torch.einsum("bdl,bnl,bdl->bdln", delta, B, u)
211
- else:
212
- B = repeat(B, "B G N L -> B (G H) N L", H=dim // B.shape[1])
213
- deltaB_u = torch.einsum("bdl,bdnl,bdl->bdln", delta, B, u)
214
- if is_variable_C and C.dim() == 4:
215
- C = repeat(C, "B G N L -> B (G H) N L", H=dim // C.shape[1])
216
- last_state = None
217
- for i in range(u.shape[2]):
218
- x = deltaA[:, :, i] * x + deltaB_u[:, :, i]
219
- if not is_variable_C:
220
- y = torch.einsum("bdn,dn->bd", x, C)
221
- else:
222
- if C.dim() == 3:
223
- y = torch.einsum("bdn,bn->bd", x, C[:, :, i])
224
- else:
225
- y = torch.einsum("bdn,bdn->bd", x, C[:, :, :, i])
226
- if i == u.shape[2] - 1:
227
- last_state = x
228
- if y.is_complex():
229
- y = y.real * 2
230
- ys.append(y)
231
- y = torch.stack(ys, dim=2) # (batch dim L)
232
- out = y if D is None else y + u * rearrange(D, "d -> d 1")
233
- if z is not None:
234
- out = out * F.silu(z)
235
- out = out.to(dtype=dtype_in)
236
- return out if not return_last_state else (out, last_state)
237
-
238
-
239
- class MambaInnerFn(torch.autograd.Function):
240
-
241
- @staticmethod
242
- @custom_fwd
243
- def forward(
244
- ctx,
245
- xz,
246
- conv1d_weight,
247
- conv1d_bias,
248
- x_proj_weight,
249
- delta_proj_weight,
250
- out_proj_weight,
251
- out_proj_bias,
252
- A,
253
- B=None,
254
- C=None,
255
- D=None,
256
- delta_bias=None,
257
- B_proj_bias=None,
258
- C_proj_bias=None,
259
- delta_softplus=True,
260
- checkpoint_lvl=1,
261
- b_rms_weight=None,
262
- c_rms_weight=None,
263
- dt_rms_weight=None,
264
- b_c_dt_rms_eps=1e-6,
265
- ):
266
- """
267
- xz: (batch, dim, seqlen)
268
- """
269
- assert (
270
- causal_conv1d_cuda is not None
271
- ), "causal_conv1d_cuda is not available. Please install causal-conv1d."
272
- assert checkpoint_lvl in [0, 1]
273
- L = xz.shape[-1]
274
- delta_rank = delta_proj_weight.shape[1]
275
- d_state = A.shape[-1] * (1 if not A.is_complex() else 2)
276
- if torch.is_autocast_enabled():
277
- x_proj_weight = x_proj_weight.to(dtype=torch.get_autocast_gpu_dtype())
278
- delta_proj_weight = delta_proj_weight.to(
279
- dtype=torch.get_autocast_gpu_dtype()
280
- )
281
- out_proj_weight = out_proj_weight.to(dtype=torch.get_autocast_gpu_dtype())
282
- out_proj_bias = (
283
- out_proj_bias.to(dtype=torch.get_autocast_gpu_dtype())
284
- if out_proj_bias is not None
285
- else None
286
- )
287
- if xz.stride(-1) != 1:
288
- xz = xz.contiguous()
289
- conv1d_weight = rearrange(conv1d_weight, "d 1 w -> d w")
290
- x, z = xz.chunk(2, dim=1)
291
- conv1d_bias = conv1d_bias.contiguous() if conv1d_bias is not None else None
292
- conv1d_out = causal_conv1d_cuda.causal_conv1d_fwd(
293
- x, conv1d_weight, conv1d_bias, None, None, None, True
294
- )
295
- # We're being very careful here about the layout, to avoid extra transposes.
296
- # We want delta to have d as the slowest moving dimension
297
- # and L as the fastest moving dimension, since those are what the ssm_scan kernel expects.
298
- x_dbl = F.linear(
299
- rearrange(conv1d_out, "b d l -> (b l) d"), x_proj_weight
300
- ) # (bl d)
301
- delta = rearrange(
302
- delta_proj_weight @ x_dbl[:, :delta_rank].t(), "d (b l) -> b d l", l=L
303
- )
304
- ctx.is_variable_B = B is None
305
- ctx.is_variable_C = C is None
306
- ctx.B_proj_bias_is_None = B_proj_bias is None
307
- ctx.C_proj_bias_is_None = C_proj_bias is None
308
- if B is None: # variable B
309
- B = x_dbl[:, delta_rank : delta_rank + d_state] # (bl dstate)
310
- if B_proj_bias is not None:
311
- B = B + B_proj_bias.to(dtype=B.dtype)
312
- if not A.is_complex():
313
- # B = rearrange(B, "(b l) dstate -> b dstate l", l=L).contiguous()
314
- B = rearrange(B, "(b l) dstate -> b 1 dstate l", l=L).contiguous()
315
- else:
316
- B = rearrange(
317
- B, "(b l) (dstate two) -> b 1 dstate (l two)", l=L, two=2
318
- ).contiguous()
319
- else:
320
- if B.stride(-1) != 1:
321
- B = B.contiguous()
322
- if C is None: # variable C
323
- C = x_dbl[:, -d_state:] # (bl dstate)
324
- if C_proj_bias is not None:
325
- C = C + C_proj_bias.to(dtype=C.dtype)
326
- if not A.is_complex():
327
- # C = rearrange(C, "(b l) dstate -> b dstate l", l=L).contiguous()
328
- C = rearrange(C, "(b l) dstate -> b 1 dstate l", l=L).contiguous()
329
- else:
330
- C = rearrange(
331
- C, "(b l) (dstate two) -> b 1 dstate (l two)", l=L, two=2
332
- ).contiguous()
333
- else:
334
- if C.stride(-1) != 1:
335
- C = C.contiguous()
336
- if D is not None:
337
- D = D.contiguous()
338
-
339
- if b_rms_weight is not None:
340
- B = rearrange(B, "b 1 dstate l -> (b l) dstate", l=L).contiguous()
341
- B = rms_norm_forward(B, b_rms_weight, bias=None, eps=b_c_dt_rms_eps)
342
- B = rearrange(B, "(b l) dstate -> b 1 dstate l", l=L).contiguous()
343
- if c_rms_weight is not None:
344
- C = rearrange(C, "b 1 dstate l -> (b l) dstate", l=L).contiguous()
345
- C = rms_norm_forward(C, c_rms_weight, bias=None, eps=b_c_dt_rms_eps)
346
- C = rearrange(C, "(b l) dstate -> b 1 dstate l", l=L).contiguous()
347
- if dt_rms_weight is not None:
348
- delta = rearrange(delta, "b d l -> (b l) d", l=L).contiguous()
349
- delta = rms_norm_forward(
350
- delta, dt_rms_weight, bias=None, eps=b_c_dt_rms_eps
351
- )
352
- delta = rearrange(delta, "(b l) d -> b d l", l=L).contiguous()
353
-
354
- out, scan_intermediates, out_z = ops.selective_scan_fwd(
355
- conv1d_out, delta, A, B, C, D, z, delta_bias, delta_softplus
356
- )
357
- ctx.delta_softplus = delta_softplus
358
- ctx.out_proj_bias_is_None = out_proj_bias is None
359
- ctx.checkpoint_lvl = checkpoint_lvl
360
- ctx.b_rms_weight = b_rms_weight
361
- ctx.c_rms_weight = c_rms_weight
362
- ctx.dt_rms_weight = dt_rms_weight
363
- ctx.b_c_dt_rms_eps = b_c_dt_rms_eps
364
- if (
365
- checkpoint_lvl >= 1
366
- ): # Will recompute conv1d_out and delta in the backward pass
367
- conv1d_out, delta = None, None
368
- ctx.save_for_backward(
369
- xz,
370
- conv1d_weight,
371
- conv1d_bias,
372
- x_dbl,
373
- x_proj_weight,
374
- delta_proj_weight,
375
- out_proj_weight,
376
- conv1d_out,
377
- delta,
378
- A,
379
- B,
380
- C,
381
- D,
382
- delta_bias,
383
- scan_intermediates,
384
- b_rms_weight,
385
- c_rms_weight,
386
- dt_rms_weight,
387
- out,
388
- )
389
- return F.linear(
390
- rearrange(out_z, "b d l -> b l d"), out_proj_weight, out_proj_bias
391
- )
392
-
393
- @staticmethod
394
- @custom_bwd
395
- def backward(ctx, dout):
396
- # dout: (batch, seqlen, dim)
397
- assert (
398
- causal_conv1d_cuda is not None
399
- ), "causal_conv1d_cuda is not available. Please install causal-conv1d."
400
- (
401
- xz,
402
- conv1d_weight,
403
- conv1d_bias,
404
- x_dbl,
405
- x_proj_weight,
406
- delta_proj_weight,
407
- out_proj_weight,
408
- conv1d_out,
409
- delta,
410
- A,
411
- B,
412
- C,
413
- D,
414
- delta_bias,
415
- scan_intermediates,
416
- b_rms_weight,
417
- c_rms_weight,
418
- dt_rms_weight,
419
- out,
420
- ) = ctx.saved_tensors
421
- L = xz.shape[-1]
422
- delta_rank = delta_proj_weight.shape[1]
423
- d_state = A.shape[-1] * (1 if not A.is_complex() else 2)
424
- x, z = xz.chunk(2, dim=1)
425
- if dout.stride(-1) != 1:
426
- dout = dout.contiguous()
427
- if ctx.checkpoint_lvl == 1:
428
- conv1d_out = causal_conv1d_cuda.causal_conv1d_fwd(
429
- x, conv1d_weight, conv1d_bias, None, None, None, True
430
- )
431
- delta = rearrange(
432
- delta_proj_weight @ x_dbl[:, :delta_rank].t(), "d (b l) -> b d l", l=L
433
- )
434
- if dt_rms_weight is not None:
435
- delta = rearrange(delta, "b d l -> (b l) d", l=L).contiguous()
436
- delta = rms_norm_forward(
437
- delta, ctx.dt_rms_weight, None, ctx.b_c_dt_rms_eps
438
- )
439
- delta = rearrange(delta, "(b l) d -> b d l", l=L).contiguous()
440
- if b_rms_weight is not None:
441
- # Recompute & RMSNorm B
442
- B = rearrange(B, "b 1 dstate l -> (b l) dstate", l=L).contiguous()
443
- B = rms_norm_forward(B, ctx.b_rms_weight, None, ctx.b_c_dt_rms_eps)
444
- B = rearrange(B, "(b l) dstate -> b 1 dstate l", l=L).contiguous()
445
- if c_rms_weight is not None:
446
- # Recompute & RMSNorm C
447
- C = rearrange(C, "b 1 dstate l -> (b l) dstate", l=L).contiguous()
448
- C = rms_norm_forward(C, ctx.c_rms_weight, None, ctx.b_c_dt_rms_eps)
449
- C = rearrange(C, "(b l) dstate -> b 1 dstate l", l=L).contiguous()
450
-
451
- # The kernel supports passing in a pre-allocated dz (e.g., in case we want to fuse the
452
- # backward of selective_scan_cuda with the backward of chunk).
453
- dxz = torch.empty_like(xz) # (batch, dim, seqlen)
454
- dx, dz = dxz.chunk(2, dim=1)
455
- dout = rearrange(dout, "b l e -> e (b l)")
456
- dout_y = rearrange(out_proj_weight.t() @ dout, "d (b l) -> b d l", l=L)
457
- dconv1d_out, ddelta, dA, dB, dC, dD, ddelta_bias, dz, out_z = (
458
- ops.selective_scan_bwd(
459
- conv1d_out,
460
- delta,
461
- A,
462
- B,
463
- C,
464
- D,
465
- z,
466
- delta_bias,
467
- dout_y,
468
- scan_intermediates,
469
- out,
470
- dz,
471
- ctx.delta_softplus,
472
- True, # option to recompute out_z
473
- )
474
- )
475
- dout_proj_weight = torch.einsum(
476
- "eB,dB->ed", dout, rearrange(out_z, "b d l -> d (b l)")
477
- )
478
- dout_proj_bias = dout.sum(dim=(0, 1)) if not ctx.out_proj_bias_is_None else None
479
- dD = dD if D is not None else None
480
- dx_dbl = torch.empty_like(x_dbl)
481
- dB_proj_bias = None
482
- if ctx.is_variable_B:
483
- if not A.is_complex():
484
- dB = rearrange(dB, "b 1 dstate l -> (b l) dstate").contiguous()
485
- else:
486
- dB = rearrange(
487
- dB, "b 1 dstate (l two) -> (b l) (dstate two)", two=2
488
- ).contiguous()
489
- dB_proj_bias = dB.sum(0) if not ctx.B_proj_bias_is_None else None
490
- dx_dbl[:, delta_rank : delta_rank + d_state] = dB # (bl d)
491
- dB = None
492
- dC_proj_bias = None
493
- if ctx.is_variable_C:
494
- if not A.is_complex():
495
- dC = rearrange(dC, "b 1 dstate l -> (b l) dstate").contiguous()
496
- else:
497
- dC = rearrange(
498
- dC, "b 1 dstate (l two) -> (b l) (dstate two)", two=2
499
- ).contiguous()
500
- dC_proj_bias = dC.sum(0) if not ctx.C_proj_bias_is_None else None
501
- dx_dbl[:, -d_state:] = dC # (bl d)
502
- dC = None
503
- ddelta = rearrange(ddelta, "b d l -> d (b l)")
504
- ddelta_proj_weight = torch.einsum("dB,Br->dr", ddelta, x_dbl[:, :delta_rank])
505
- dx_dbl[:, :delta_rank] = torch.einsum("dB,dr->Br", ddelta, delta_proj_weight)
506
- dconv1d_out = rearrange(dconv1d_out, "b d l -> d (b l)")
507
- dx_proj_weight = torch.einsum(
508
- "Br,Bd->rd", dx_dbl, rearrange(conv1d_out, "b d l -> (b l) d")
509
- )
510
- dconv1d_out = torch.addmm(
511
- dconv1d_out, x_proj_weight.t(), dx_dbl.t(), out=dconv1d_out
512
- )
513
- dconv1d_out = rearrange(
514
- dconv1d_out, "d (b l) -> b d l", b=x.shape[0], l=x.shape[-1]
515
- )
516
- # The kernel supports passing in a pre-allocated dx (e.g., in case we want to fuse the
517
- # backward of conv1d with the backward of chunk).
518
- dx, dconv1d_weight, dconv1d_bias, *_ = causal_conv1d_cuda.causal_conv1d_bwd(
519
- x,
520
- conv1d_weight,
521
- conv1d_bias,
522
- dconv1d_out,
523
- None,
524
- None,
525
- None,
526
- dx,
527
- False,
528
- True,
529
- )
530
- dconv1d_bias = dconv1d_bias if conv1d_bias is not None else None
531
- dconv1d_weight = rearrange(dconv1d_weight, "d w -> d 1 w")
532
- return (
533
- dxz,
534
- dconv1d_weight,
535
- dconv1d_bias,
536
- dx_proj_weight,
537
- ddelta_proj_weight,
538
- dout_proj_weight,
539
- dout_proj_bias,
540
- dA,
541
- dB,
542
- dC,
543
- dD,
544
- ddelta_bias if delta_bias is not None else None,
545
- # 6-None are delta_softplus, checkpoint_lvl, b_rms_weight, c_rms_weight, dt_rms_weight, b_c_dt_rms_eps
546
- dB_proj_bias,
547
- dC_proj_bias,
548
- None,
549
- None,
550
- None,
551
- None,
552
- None,
553
- None,
554
- )
555
-
556
-
557
- def mamba_inner_fn(
558
- xz,
559
- conv1d_weight,
560
- conv1d_bias,
561
- x_proj_weight,
562
- delta_proj_weight,
563
- out_proj_weight,
564
- out_proj_bias,
565
- A,
566
- B=None,
567
- C=None,
568
- D=None,
569
- delta_bias=None,
570
- B_proj_bias=None,
571
- C_proj_bias=None,
572
- delta_softplus=True,
573
- checkpoint_lvl=1,
574
- b_rms_weight=None,
575
- c_rms_weight=None,
576
- dt_rms_weight=None,
577
- b_c_dt_rms_eps=1e-6,
578
- ):
579
- return MambaInnerFn.apply(
580
- xz,
581
- conv1d_weight,
582
- conv1d_bias,
583
- x_proj_weight,
584
- delta_proj_weight,
585
- out_proj_weight,
586
- out_proj_bias,
587
- A,
588
- B,
589
- C,
590
- D,
591
- delta_bias,
592
- B_proj_bias,
593
- C_proj_bias,
594
- delta_softplus,
595
- checkpoint_lvl,
596
- b_rms_weight,
597
- c_rms_weight,
598
- dt_rms_weight,
599
- b_c_dt_rms_eps,
600
- )
601
-
602
-
603
- def mamba_inner_ref(
604
- xz,
605
- conv1d_weight,
606
- conv1d_bias,
607
- x_proj_weight,
608
- delta_proj_weight,
609
- out_proj_weight,
610
- out_proj_bias,
611
- A,
612
- B=None,
613
- C=None,
614
- D=None,
615
- delta_bias=None,
616
- B_proj_bias=None,
617
- C_proj_bias=None,
618
- delta_softplus=True,
619
- ):
620
- assert (
621
- causal_conv1d_fn is not None
622
- ), "causal_conv1d_fn is not available. Please install causal-conv1d."
623
- L = xz.shape[-1]
624
- delta_rank = delta_proj_weight.shape[1]
625
- d_state = A.shape[-1] * (1 if not A.is_complex() else 2)
626
- x, z = xz.chunk(2, dim=1)
627
- x = causal_conv1d_fn(
628
- x, rearrange(conv1d_weight, "d 1 w -> d w"), conv1d_bias, activation="silu"
629
- )
630
- # We're being very careful here about the layout, to avoid extra transposes.
631
- # We want delta to have d as the slowest moving dimension
632
- # and L as the fastest moving dimension, since those are what the ssm_scan kernel expects.
633
- x_dbl = F.linear(rearrange(x, "b d l -> (b l) d"), x_proj_weight) # (bl d)
634
- delta = delta_proj_weight @ x_dbl[:, :delta_rank].t()
635
- delta = rearrange(delta, "d (b l) -> b d l", l=L)
636
- if B is None: # variable B
637
- B = x_dbl[:, delta_rank : delta_rank + d_state] # (bl d)
638
- if B_proj_bias is not None:
639
- B = B + B_proj_bias.to(dtype=B.dtype)
640
- if not A.is_complex():
641
- B = rearrange(B, "(b l) dstate -> b dstate l", l=L).contiguous()
642
- else:
643
- B = rearrange(
644
- B, "(b l) (dstate two) -> b dstate (l two)", l=L, two=2
645
- ).contiguous()
646
- if C is None: # variable B
647
- C = x_dbl[:, -d_state:] # (bl d)
648
- if C_proj_bias is not None:
649
- C = C + C_proj_bias.to(dtype=C.dtype)
650
- if not A.is_complex():
651
- C = rearrange(C, "(b l) dstate -> b dstate l", l=L).contiguous()
652
- else:
653
- C = rearrange(
654
- C, "(b l) (dstate two) -> b dstate (l two)", l=L, two=2
655
- ).contiguous()
656
- y = selective_scan_fn(
657
- x, delta, A, B, C, D, z=z, delta_bias=delta_bias, delta_softplus=True
658
- )
659
- return F.linear(rearrange(y, "b d l -> b l d"), out_proj_weight, out_proj_bias)
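
Because selective_scan_ref above is pure PyTorch, it is the easiest way to sanity-check shapes for the fused selective_scan_fn. A minimal sketch with illustrative sizes, exercising the variable (input-dependent) B and C path from the docstring; it assumes the package and its compiled ops import cleanly:

    import torch
    from mamba_ssm.ops.selective_scan_interface import selective_scan_ref

    batch, dim, dstate, seqlen = 2, 64, 16, 128   # illustrative sizes
    u = torch.randn(batch, dim, seqlen)
    delta = torch.rand(batch, dim, seqlen)
    A = -torch.rand(dim, dstate)                  # real-valued A: r(D N)
    B = torch.randn(batch, dstate, seqlen)        # variable B: r(B N L)
    C = torch.randn(batch, dstate, seqlen)        # variable C: r(B N L)
    D = torch.randn(dim)
    out, last_state = selective_scan_ref(
        u, delta, A, B, C, D=D, delta_softplus=True, return_last_state=True
    )
    print(out.shape, last_state.shape)            # (2, 64, 128) (2, 64, 16)
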
build/torch25-cxx11-cu124-x86_64-linux/mamba_ssm/ops/triton/__init__.py DELETED
File without changes
build/torch25-cxx11-cu124-x86_64-linux/mamba_ssm/ops/triton/layer_norm.py DELETED
@@ -1,1166 +0,0 @@
1
- # Copyright (c) 2024, Tri Dao.
2
- # Implement dropout + residual + layer_norm / rms_norm.
3
-
4
- # Based on the Triton LayerNorm tutorial: https://triton-lang.org/main/getting-started/tutorials/05-layer-norm.html
5
- # For the backward pass, we keep weight_grad and bias_grad in registers and accumulate.
6
- # This is faster for dimensions up to 8k, but after that it's much slower due to register spilling.
7
- # The models we train have hidden dim up to 8k anyway (e.g. Llama 70B), so this is fine.
8
-
9
- import math
10
- import warnings
11
-
12
- import torch
13
- import torch.nn.functional as F
14
- from ...utils.torch import custom_bwd, custom_fwd
15
-
16
- import triton
17
- import triton.language as tl
18
-
19
-
20
- def layer_norm_ref(
21
- x,
22
- weight,
23
- bias,
24
- residual=None,
25
- x1=None,
26
- weight1=None,
27
- bias1=None,
28
- eps=1e-6,
29
- dropout_p=0.0,
30
- rowscale=None,
31
- prenorm=False,
32
- dropout_mask=None,
33
- dropout_mask1=None,
34
- upcast=False,
35
- ):
36
- dtype = x.dtype
37
- if upcast:
38
- x = x.float()
39
- weight = weight.float()
40
- bias = bias.float() if bias is not None else None
41
- residual = residual.float() if residual is not None else residual
42
- x1 = x1.float() if x1 is not None else None
43
- weight1 = weight1.float() if weight1 is not None else None
44
- bias1 = bias1.float() if bias1 is not None else None
45
- if x1 is not None:
46
- assert rowscale is None, "rowscale is not supported with parallel LayerNorm"
47
- if rowscale is not None:
48
- x = x * rowscale[..., None]
49
- if dropout_p > 0.0:
50
- if dropout_mask is not None:
51
- x = x.masked_fill(~dropout_mask, 0.0) / (1.0 - dropout_p)
52
- else:
53
- x = F.dropout(x, p=dropout_p)
54
- if x1 is not None:
55
- if dropout_mask1 is not None:
56
- x1 = x1.masked_fill(~dropout_mask1, 0.0) / (1.0 - dropout_p)
57
- else:
58
- x1 = F.dropout(x1, p=dropout_p)
59
- if x1 is not None:
60
- x = x + x1
61
- if residual is not None:
62
- x = (x + residual).to(x.dtype)
63
- out = F.layer_norm(
64
- x.to(weight.dtype), x.shape[-1:], weight=weight, bias=bias, eps=eps
65
- ).to(dtype)
66
- if weight1 is None:
67
- return out if not prenorm else (out, x)
68
- else:
69
- out1 = F.layer_norm(
70
- x.to(weight1.dtype), x.shape[-1:], weight=weight1, bias=bias1, eps=eps
71
- ).to(dtype)
72
- return (out, out1) if not prenorm else (out, out1, x)
73
-
74
-
75
- def rms_norm_ref(
76
- x,
77
- weight,
78
- bias,
79
- residual=None,
80
- x1=None,
81
- weight1=None,
82
- bias1=None,
83
- eps=1e-6,
84
- dropout_p=0.0,
85
- rowscale=None,
86
- prenorm=False,
87
- dropout_mask=None,
88
- dropout_mask1=None,
89
- upcast=False,
90
- ):
91
- dtype = x.dtype
92
- if upcast:
93
- x = x.float()
94
- weight = weight.float()
95
- bias = bias.float() if bias is not None else None
96
- residual = residual.float() if residual is not None else residual
97
- x1 = x1.float() if x1 is not None else None
98
- weight1 = weight1.float() if weight1 is not None else None
99
- bias1 = bias1.float() if bias1 is not None else None
100
- if x1 is not None:
101
- assert rowscale is None, "rowscale is not supported with parallel LayerNorm"
102
- if rowscale is not None:
103
- x = x * rowscale[..., None]
104
- if dropout_p > 0.0:
105
- if dropout_mask is not None:
106
- x = x.masked_fill(~dropout_mask, 0.0) / (1.0 - dropout_p)
107
- else:
108
- x = F.dropout(x, p=dropout_p)
109
- if x1 is not None:
110
- if dropout_mask1 is not None:
111
- x1 = x1.masked_fill(~dropout_mask1, 0.0) / (1.0 - dropout_p)
112
- else:
113
- x1 = F.dropout(x1, p=dropout_p)
114
- if x1 is not None:
115
- x = x + x1
116
- if residual is not None:
117
- x = (x + residual).to(x.dtype)
118
- rstd = 1 / torch.sqrt((x.square()).mean(dim=-1, keepdim=True) + eps)
119
- out = ((x * rstd * weight) + bias if bias is not None else (x * rstd * weight)).to(
120
- dtype
121
- )
122
- if weight1 is None:
123
- return out if not prenorm else (out, x)
124
- else:
125
- out1 = (
126
- (x * rstd * weight1) + bias1 if bias1 is not None else (x * rstd * weight1)
127
- ).to(dtype)
128
- return (out, out1) if not prenorm else (out, out1, x)
129
-
130
-
131
- def config_prune(configs):
132
-
133
- if torch.version.hip:
134
- try:
135
- # set warp size based on gcn architecure
136
- gcn_arch_name = torch.cuda.get_device_properties(0).gcnArchName
137
- if "gfx10" in gcn_arch_name or "gfx11" in gcn_arch_name:
138
- # radeon
139
- warp_size = 32
140
- else:
141
- # instinct
142
- warp_size = 64
143
- except AttributeError as e:
144
- # fall back to crude method to set warp size
145
- device_name = torch.cuda.get_device_properties(0).name
146
- if "instinct" in device_name.lower():
147
- warp_size = 64
148
- else:
149
- warp_size = 32
150
- warnings.warn(
151
- f"{e}, warp size set to {warp_size} based on device name: {device_name}",
152
- UserWarning,
153
- )
154
-
155
- else:
156
- # cuda
157
- warp_size = 32
158
-
159
- max_block_sz = 1024
160
- max_num_warps = max_block_sz // warp_size
161
- pruned_configs = [config for config in configs if config.num_warps <= max_num_warps]
162
- return pruned_configs
163
-
164
-
165
- configs_autotune = [
166
- triton.Config({}, num_warps=1),
167
- triton.Config({}, num_warps=2),
168
- triton.Config({}, num_warps=4),
169
- triton.Config({}, num_warps=8),
170
- triton.Config({}, num_warps=16),
171
- triton.Config({}, num_warps=32),
172
- ]
173
-
174
- pruned_configs_autotune = config_prune(configs_autotune)
175
-
176
-
177
- @triton.autotune(
178
- configs=pruned_configs_autotune,
179
- key=["N", "HAS_RESIDUAL", "STORE_RESIDUAL_OUT", "IS_RMS_NORM", "HAS_BIAS"],
180
- )
181
- # @triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None})
182
- # @triton.heuristics({"HAS_RESIDUAL": lambda args: args["RESIDUAL"] is not None})
183
- @triton.heuristics({"HAS_X1": lambda args: args["X1"] is not None})
184
- @triton.heuristics({"HAS_W1": lambda args: args["W1"] is not None})
185
- @triton.heuristics({"HAS_B1": lambda args: args["B1"] is not None})
186
- @triton.jit
187
- def _layer_norm_fwd_1pass_kernel(
188
- X, # pointer to the input
189
- Y, # pointer to the output
190
- W, # pointer to the weights
191
- B, # pointer to the biases
192
- RESIDUAL, # pointer to the residual
193
- X1,
194
- W1,
195
- B1,
196
- Y1,
197
- RESIDUAL_OUT, # pointer to the residual
198
- ROWSCALE,
199
- SEEDS, # Dropout seeds for each row
200
- DROPOUT_MASK,
201
- Mean, # pointer to the mean
202
- Rstd, # pointer to the 1/std
203
- stride_x_row, # how much to increase the pointer when moving by 1 row
204
- stride_y_row,
205
- stride_res_row,
206
- stride_res_out_row,
207
- stride_x1_row,
208
- stride_y1_row,
209
- M, # number of rows in X
210
- N, # number of columns in X
211
- eps, # epsilon to avoid division by zero
212
- dropout_p, # Dropout probability
213
- IS_RMS_NORM: tl.constexpr,
214
- BLOCK_N: tl.constexpr,
215
- HAS_RESIDUAL: tl.constexpr,
216
- STORE_RESIDUAL_OUT: tl.constexpr,
217
- HAS_BIAS: tl.constexpr,
218
- HAS_DROPOUT: tl.constexpr,
219
- STORE_DROPOUT_MASK: tl.constexpr,
220
- HAS_ROWSCALE: tl.constexpr,
221
- HAS_X1: tl.constexpr,
222
- HAS_W1: tl.constexpr,
223
- HAS_B1: tl.constexpr,
224
- ):
225
- # Map the program id to the row of X and Y it should compute.
226
- row = tl.program_id(0)
227
- X += row * stride_x_row
228
- Y += row * stride_y_row
229
- if HAS_RESIDUAL:
230
- RESIDUAL += row * stride_res_row
231
- if STORE_RESIDUAL_OUT:
232
- RESIDUAL_OUT += row * stride_res_out_row
233
- if HAS_X1:
234
- X1 += row * stride_x1_row
235
- if HAS_W1:
236
- Y1 += row * stride_y1_row
237
- # Compute mean and variance
238
- cols = tl.arange(0, BLOCK_N)
239
- x = tl.load(X + cols, mask=cols < N, other=0.0).to(tl.float32)
240
- if HAS_ROWSCALE:
241
- rowscale = tl.load(ROWSCALE + row).to(tl.float32)
242
- x *= rowscale
243
- if HAS_DROPOUT:
244
- # Compute dropout mask
245
- # 7 rounds is good enough, and reduces register pressure
246
- keep_mask = (
247
- tl.rand(tl.load(SEEDS + row).to(tl.uint32), cols, n_rounds=7) > dropout_p
248
- )
249
- x = tl.where(keep_mask, x / (1.0 - dropout_p), 0.0)
250
- if STORE_DROPOUT_MASK:
251
- tl.store(DROPOUT_MASK + row * N + cols, keep_mask, mask=cols < N)
252
- if HAS_X1:
253
- x1 = tl.load(X1 + cols, mask=cols < N, other=0.0).to(tl.float32)
254
- if HAS_ROWSCALE:
255
- rowscale = tl.load(ROWSCALE + M + row).to(tl.float32)
256
- x1 *= rowscale
257
- if HAS_DROPOUT:
258
- # Compute dropout mask
259
- # 7 rounds is good enough, and reduces register pressure
260
- keep_mask = (
261
- tl.rand(tl.load(SEEDS + M + row).to(tl.uint32), cols, n_rounds=7)
262
- > dropout_p
263
- )
264
- x1 = tl.where(keep_mask, x1 / (1.0 - dropout_p), 0.0)
265
- if STORE_DROPOUT_MASK:
266
- tl.store(DROPOUT_MASK + (M + row) * N + cols, keep_mask, mask=cols < N)
267
- x += x1
268
- if HAS_RESIDUAL:
269
- residual = tl.load(RESIDUAL + cols, mask=cols < N, other=0.0).to(tl.float32)
270
- x += residual
271
- if STORE_RESIDUAL_OUT:
272
- tl.store(RESIDUAL_OUT + cols, x, mask=cols < N)
273
- if not IS_RMS_NORM:
274
- mean = tl.sum(x, axis=0) / N
275
- tl.store(Mean + row, mean)
276
- xbar = tl.where(cols < N, x - mean, 0.0)
277
- var = tl.sum(xbar * xbar, axis=0) / N
278
- else:
279
- xbar = tl.where(cols < N, x, 0.0)
280
- var = tl.sum(xbar * xbar, axis=0) / N
281
- rstd = 1 / tl.sqrt(var + eps)
282
- tl.store(Rstd + row, rstd)
283
- # Normalize and apply linear transformation
284
- mask = cols < N
285
- w = tl.load(W + cols, mask=mask).to(tl.float32)
286
- if HAS_BIAS:
287
- b = tl.load(B + cols, mask=mask).to(tl.float32)
288
- x_hat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd
289
- y = x_hat * w + b if HAS_BIAS else x_hat * w
290
- # Write output
291
- tl.store(Y + cols, y, mask=mask)
292
- if HAS_W1:
293
- w1 = tl.load(W1 + cols, mask=mask).to(tl.float32)
294
- if HAS_B1:
295
- b1 = tl.load(B1 + cols, mask=mask).to(tl.float32)
296
- y1 = x_hat * w1 + b1 if HAS_B1 else x_hat * w1
297
- tl.store(Y1 + cols, y1, mask=mask)
298
-
299
-
300
- def _layer_norm_fwd(
301
- x,
302
- weight,
303
- bias,
304
- eps,
305
- residual=None,
306
- x1=None,
307
- weight1=None,
308
- bias1=None,
309
- dropout_p=0.0,
310
- rowscale=None,
311
- out_dtype=None,
312
- residual_dtype=None,
313
- is_rms_norm=False,
314
- return_dropout_mask=False,
315
- ):
316
- if residual is not None:
317
- residual_dtype = residual.dtype
318
- M, N = x.shape
319
- assert x.stride(-1) == 1
320
- if residual is not None:
321
- assert residual.stride(-1) == 1
322
- assert residual.shape == (M, N)
323
- assert weight.shape == (N,)
324
- assert weight.stride(-1) == 1
325
- if bias is not None:
326
- assert bias.stride(-1) == 1
327
- assert bias.shape == (N,)
328
- if x1 is not None:
329
- assert x1.shape == x.shape
330
- assert rowscale is None
331
- assert x1.stride(-1) == 1
332
- if weight1 is not None:
333
- assert weight1.shape == (N,)
334
- assert weight1.stride(-1) == 1
335
- if bias1 is not None:
336
- assert bias1.shape == (N,)
337
- assert bias1.stride(-1) == 1
338
- if rowscale is not None:
339
- assert rowscale.is_contiguous()
340
- assert rowscale.shape == (M,)
341
- # allocate output
342
- y = torch.empty_like(x, dtype=x.dtype if out_dtype is None else out_dtype)
343
- assert y.stride(-1) == 1
344
- if weight1 is not None:
345
- y1 = torch.empty_like(y)
346
- assert y1.stride(-1) == 1
347
- else:
348
- y1 = None
349
- if (
350
- residual is not None
351
- or (residual_dtype is not None and residual_dtype != x.dtype)
352
- or dropout_p > 0.0
353
- or rowscale is not None
354
- or x1 is not None
355
- ):
356
- residual_out = torch.empty(
357
- M,
358
- N,
359
- device=x.device,
360
- dtype=residual_dtype if residual_dtype is not None else x.dtype,
361
- )
362
- assert residual_out.stride(-1) == 1
363
- else:
364
- residual_out = None
365
- mean = (
366
- torch.empty((M,), dtype=torch.float32, device=x.device)
367
- if not is_rms_norm
368
- else None
369
- )
370
- rstd = torch.empty((M,), dtype=torch.float32, device=x.device)
371
- if dropout_p > 0.0:
372
- seeds = torch.randint(
373
- 2**32, (M if x1 is None else 2 * M,), device=x.device, dtype=torch.int64
374
- )
375
- else:
376
- seeds = None
377
- if return_dropout_mask and dropout_p > 0.0:
378
- dropout_mask = torch.empty(
379
- M if x1 is None else 2 * M, N, device=x.device, dtype=torch.bool
380
- )
381
- else:
382
- dropout_mask = None
383
- # Less than 64KB per feature: enqueue fused kernel
384
- MAX_FUSED_SIZE = 65536 // x.element_size()
385
- BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
386
- if N > BLOCK_N:
387
- raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
388
- with torch.cuda.device(x.device.index):
389
- _layer_norm_fwd_1pass_kernel[(M,)](
390
- x,
391
- y,
392
- weight,
393
- bias,
394
- residual,
395
- x1,
396
- weight1,
397
- bias1,
398
- y1,
399
- residual_out,
400
- rowscale,
401
- seeds,
402
- dropout_mask,
403
- mean,
404
- rstd,
405
- x.stride(0),
406
- y.stride(0),
407
- residual.stride(0) if residual is not None else 0,
408
- residual_out.stride(0) if residual_out is not None else 0,
409
- x1.stride(0) if x1 is not None else 0,
410
- y1.stride(0) if y1 is not None else 0,
411
- M,
412
- N,
413
- eps,
414
- dropout_p,
415
- is_rms_norm,
416
- BLOCK_N,
417
- residual is not None,
418
- residual_out is not None,
419
- bias is not None,
420
- dropout_p > 0.0,
421
- dropout_mask is not None,
422
- rowscale is not None,
423
- )
424
- # residual_out is None if residual is None and residual_dtype == input_dtype and dropout_p == 0.0
425
- if dropout_mask is not None and x1 is not None:
426
- dropout_mask, dropout_mask1 = dropout_mask.tensor_split(2, dim=0)
427
- else:
428
- dropout_mask1 = None
429
- return (
430
- y,
431
- y1,
432
- mean,
433
- rstd,
434
- residual_out if residual_out is not None else x,
435
- seeds,
436
- dropout_mask,
437
- dropout_mask1,
438
- )
439
-
440
-
441
- @triton.autotune(
442
- configs=pruned_configs_autotune,
443
- key=[
444
- "N",
445
- "HAS_DRESIDUAL",
446
- "STORE_DRESIDUAL",
447
- "IS_RMS_NORM",
448
- "HAS_BIAS",
449
- "HAS_DROPOUT",
450
- ],
451
- )
452
- # @triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None})
453
- # @triton.heuristics({"HAS_DRESIDUAL": lambda args: args["DRESIDUAL"] is not None})
454
- # @triton.heuristics({"STORE_DRESIDUAL": lambda args: args["DRESIDUAL_IN"] is not None})
455
- @triton.heuristics({"HAS_ROWSCALE": lambda args: args["ROWSCALE"] is not None})
456
- @triton.heuristics({"HAS_DY1": lambda args: args["DY1"] is not None})
457
- @triton.heuristics({"HAS_DX1": lambda args: args["DX1"] is not None})
458
- @triton.heuristics({"HAS_B1": lambda args: args["DB1"] is not None})
459
- @triton.heuristics({"RECOMPUTE_OUTPUT": lambda args: args["Y"] is not None})
460
- @triton.jit
461
- def _layer_norm_bwd_kernel(
462
- X, # pointer to the input
463
- W, # pointer to the weights
464
- B, # pointer to the biases
465
- Y, # pointer to the output to be recomputed
466
- DY, # pointer to the output gradient
467
- DX, # pointer to the input gradient
468
- DW, # pointer to the partial sum of weights gradient
469
- DB, # pointer to the partial sum of biases gradient
470
- DRESIDUAL,
471
- W1,
472
- DY1,
473
- DX1,
474
- DW1,
475
- DB1,
476
- DRESIDUAL_IN,
477
- ROWSCALE,
478
- SEEDS,
479
- Mean, # pointer to the mean
480
- Rstd, # pointer to the 1/std
481
- stride_x_row, # how much to increase the pointer when moving by 1 row
482
- stride_y_row,
483
- stride_dy_row,
484
- stride_dx_row,
485
- stride_dres_row,
486
- stride_dy1_row,
487
- stride_dx1_row,
488
- stride_dres_in_row,
489
- M, # number of rows in X
490
- N, # number of columns in X
491
- eps, # epsilon to avoid division by zero
492
- dropout_p,
493
- rows_per_program,
494
- IS_RMS_NORM: tl.constexpr,
495
- BLOCK_N: tl.constexpr,
496
- HAS_DRESIDUAL: tl.constexpr,
497
- STORE_DRESIDUAL: tl.constexpr,
498
- HAS_BIAS: tl.constexpr,
499
- HAS_DROPOUT: tl.constexpr,
500
- HAS_ROWSCALE: tl.constexpr,
501
- HAS_DY1: tl.constexpr,
502
- HAS_DX1: tl.constexpr,
503
- HAS_B1: tl.constexpr,
504
- RECOMPUTE_OUTPUT: tl.constexpr,
505
- ):
506
- # Map the program id to the elements of X, DX, and DY it should compute.
507
- row_block_id = tl.program_id(0)
508
- row_start = row_block_id * rows_per_program
509
- # Do not early exit if row_start >= M, because we need to write DW and DB
510
- cols = tl.arange(0, BLOCK_N)
511
- mask = cols < N
512
- X += row_start * stride_x_row
513
- if HAS_DRESIDUAL:
514
- DRESIDUAL += row_start * stride_dres_row
515
- if STORE_DRESIDUAL:
516
- DRESIDUAL_IN += row_start * stride_dres_in_row
517
- DY += row_start * stride_dy_row
518
- DX += row_start * stride_dx_row
519
- if HAS_DY1:
520
- DY1 += row_start * stride_dy1_row
521
- if HAS_DX1:
522
- DX1 += row_start * stride_dx1_row
523
- if RECOMPUTE_OUTPUT:
524
- Y += row_start * stride_y_row
525
- w = tl.load(W + cols, mask=mask).to(tl.float32)
526
- if RECOMPUTE_OUTPUT and HAS_BIAS:
527
- b = tl.load(B + cols, mask=mask, other=0.0).to(tl.float32)
528
- if HAS_DY1:
529
- w1 = tl.load(W1 + cols, mask=mask).to(tl.float32)
530
- dw = tl.zeros((BLOCK_N,), dtype=tl.float32)
531
- if HAS_BIAS:
532
- db = tl.zeros((BLOCK_N,), dtype=tl.float32)
533
- if HAS_DY1:
534
- dw1 = tl.zeros((BLOCK_N,), dtype=tl.float32)
535
- if HAS_B1:
536
- db1 = tl.zeros((BLOCK_N,), dtype=tl.float32)
537
- row_end = min((row_block_id + 1) * rows_per_program, M)
538
- for row in range(row_start, row_end):
539
- # Load data to SRAM
540
- x = tl.load(X + cols, mask=mask, other=0).to(tl.float32)
541
- dy = tl.load(DY + cols, mask=mask, other=0).to(tl.float32)
542
- if HAS_DY1:
543
- dy1 = tl.load(DY1 + cols, mask=mask, other=0).to(tl.float32)
544
- if not IS_RMS_NORM:
545
- mean = tl.load(Mean + row)
546
- rstd = tl.load(Rstd + row)
547
- # Compute dx
548
- xhat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd
549
- xhat = tl.where(mask, xhat, 0.0)
550
- if RECOMPUTE_OUTPUT:
551
- y = xhat * w + b if HAS_BIAS else xhat * w
552
- tl.store(Y + cols, y, mask=mask)
553
- wdy = w * dy
554
- dw += dy * xhat
555
- if HAS_BIAS:
556
- db += dy
557
- if HAS_DY1:
558
- wdy += w1 * dy1
559
- dw1 += dy1 * xhat
560
- if HAS_B1:
561
- db1 += dy1
562
- if not IS_RMS_NORM:
563
- c1 = tl.sum(xhat * wdy, axis=0) / N
564
- c2 = tl.sum(wdy, axis=0) / N
565
- dx = (wdy - (xhat * c1 + c2)) * rstd
566
- else:
567
- c1 = tl.sum(xhat * wdy, axis=0) / N
568
- dx = (wdy - xhat * c1) * rstd
569
- if HAS_DRESIDUAL:
570
- dres = tl.load(DRESIDUAL + cols, mask=mask, other=0).to(tl.float32)
571
- dx += dres
572
- # Write dx
573
- if STORE_DRESIDUAL:
574
- tl.store(DRESIDUAL_IN + cols, dx, mask=mask)
575
- if HAS_DX1:
576
- if HAS_DROPOUT:
577
- keep_mask = (
578
- tl.rand(tl.load(SEEDS + M + row).to(tl.uint32), cols, n_rounds=7)
579
- > dropout_p
580
- )
581
- dx1 = tl.where(keep_mask, dx / (1.0 - dropout_p), 0.0)
582
- else:
583
- dx1 = dx
584
- tl.store(DX1 + cols, dx1, mask=mask)
585
- if HAS_DROPOUT:
586
- keep_mask = (
587
- tl.rand(tl.load(SEEDS + row).to(tl.uint32), cols, n_rounds=7)
588
- > dropout_p
589
- )
590
- dx = tl.where(keep_mask, dx / (1.0 - dropout_p), 0.0)
591
- if HAS_ROWSCALE:
592
- rowscale = tl.load(ROWSCALE + row).to(tl.float32)
593
- dx *= rowscale
594
- tl.store(DX + cols, dx, mask=mask)
595
-
596
- X += stride_x_row
597
- if HAS_DRESIDUAL:
598
- DRESIDUAL += stride_dres_row
599
- if STORE_DRESIDUAL:
600
- DRESIDUAL_IN += stride_dres_in_row
601
- if RECOMPUTE_OUTPUT:
602
- Y += stride_y_row
603
- DY += stride_dy_row
604
- DX += stride_dx_row
605
- if HAS_DY1:
606
- DY1 += stride_dy1_row
607
- if HAS_DX1:
608
- DX1 += stride_dx1_row
609
- tl.store(DW + row_block_id * N + cols, dw, mask=mask)
610
- if HAS_BIAS:
611
- tl.store(DB + row_block_id * N + cols, db, mask=mask)
612
- if HAS_DY1:
613
- tl.store(DW1 + row_block_id * N + cols, dw1, mask=mask)
614
- if HAS_B1:
615
- tl.store(DB1 + row_block_id * N + cols, db1, mask=mask)
616
-
617
-
618
- def _layer_norm_bwd(
619
- dy,
620
- x,
621
- weight,
622
- bias,
623
- eps,
624
- mean,
625
- rstd,
626
- dresidual=None,
627
- dy1=None,
628
- weight1=None,
629
- bias1=None,
630
- seeds=None,
631
- dropout_p=0.0,
632
- rowscale=None,
633
- has_residual=False,
634
- has_x1=False,
635
- is_rms_norm=False,
636
- x_dtype=None,
637
- recompute_output=False,
638
- ):
639
- M, N = x.shape
640
- assert x.stride(-1) == 1
641
- assert dy.stride(-1) == 1
642
- assert dy.shape == (M, N)
643
- if dresidual is not None:
644
- assert dresidual.stride(-1) == 1
645
- assert dresidual.shape == (M, N)
646
- assert weight.shape == (N,)
647
- assert weight.stride(-1) == 1
648
- if bias is not None:
649
- assert bias.stride(-1) == 1
650
- assert bias.shape == (N,)
651
- if dy1 is not None:
652
- assert weight1 is not None
653
- assert dy1.shape == dy.shape
654
- assert dy1.stride(-1) == 1
655
- if weight1 is not None:
656
- assert weight1.shape == (N,)
657
- assert weight1.stride(-1) == 1
658
- if bias1 is not None:
659
- assert bias1.shape == (N,)
660
- assert bias1.stride(-1) == 1
661
- if seeds is not None:
662
- assert seeds.is_contiguous()
663
- assert seeds.shape == (M if not has_x1 else M * 2,)
664
- if rowscale is not None:
665
- assert rowscale.is_contiguous()
666
- assert rowscale.shape == (M,)
667
- # allocate output
668
- dx = (
669
- torch.empty_like(x)
670
- if x_dtype is None
671
- else torch.empty(M, N, dtype=x_dtype, device=x.device)
672
- )
673
- dresidual_in = (
674
- torch.empty_like(x)
675
- if has_residual
676
- and (dx.dtype != x.dtype or dropout_p > 0.0 or rowscale is not None or has_x1)
677
- else None
678
- )
679
- dx1 = torch.empty_like(dx) if (has_x1 and dropout_p > 0.0) else None
680
- y = (
681
- torch.empty(M, N, dtype=dy.dtype, device=dy.device)
682
- if recompute_output
683
- else None
684
- )
685
- if recompute_output:
686
- assert (
687
- weight1 is None
688
- ), "recompute_output is not supported with parallel LayerNorm"
689
-
690
- # Less than 64KB per feature: enqueue fused kernel
691
- MAX_FUSED_SIZE = 65536 // x.element_size()
692
- BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
693
- if N > BLOCK_N:
694
- raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
695
- sm_count = torch.cuda.get_device_properties(x.device).multi_processor_count
696
- _dw = torch.empty((sm_count, N), dtype=torch.float32, device=weight.device)
697
- _db = (
698
- torch.empty((sm_count, N), dtype=torch.float32, device=bias.device)
699
- if bias is not None
700
- else None
701
- )
702
- _dw1 = torch.empty_like(_dw) if weight1 is not None else None
703
- _db1 = torch.empty_like(_db) if bias1 is not None else None
704
- rows_per_program = math.ceil(M / sm_count)
705
- grid = (sm_count,)
706
- with torch.cuda.device(x.device.index):
707
- _layer_norm_bwd_kernel[grid](
708
- x,
709
- weight,
710
- bias,
711
- y,
712
- dy,
713
- dx,
714
- _dw,
715
- _db,
716
- dresidual,
717
- weight1,
718
- dy1,
719
- dx1,
720
- _dw1,
721
- _db1,
722
- dresidual_in,
723
- rowscale,
724
- seeds,
725
- mean,
726
- rstd,
727
- x.stride(0),
728
- 0 if not recompute_output else y.stride(0),
729
- dy.stride(0),
730
- dx.stride(0),
731
- dresidual.stride(0) if dresidual is not None else 0,
732
- dy1.stride(0) if dy1 is not None else 0,
733
- dx1.stride(0) if dx1 is not None else 0,
734
- dresidual_in.stride(0) if dresidual_in is not None else 0,
735
- M,
736
- N,
737
- eps,
738
- dropout_p,
739
- rows_per_program,
740
- is_rms_norm,
741
- BLOCK_N,
742
- dresidual is not None,
743
- dresidual_in is not None,
744
- bias is not None,
745
- dropout_p > 0.0,
746
- )
747
- dw = _dw.sum(0).to(weight.dtype)
748
- db = _db.sum(0).to(bias.dtype) if bias is not None else None
749
- dw1 = _dw1.sum(0).to(weight1.dtype) if weight1 is not None else None
750
- db1 = _db1.sum(0).to(bias1.dtype) if bias1 is not None else None
751
- # Don't need to compute dresidual_in separately in this case
752
- if has_residual and dx.dtype == x.dtype and dropout_p == 0.0 and rowscale is None:
753
- dresidual_in = dx
754
- if has_x1 and dropout_p == 0.0:
755
- dx1 = dx
756
- return (
757
- (dx, dw, db, dresidual_in, dx1, dw1, db1)
758
- if not recompute_output
759
- else (dx, dw, db, dresidual_in, dx1, dw1, db1, y)
760
- )
761
-
762
-
763
- class LayerNormFn(torch.autograd.Function):
764
- @staticmethod
765
- def forward(
766
- ctx,
767
- x,
768
- weight,
769
- bias,
770
- residual=None,
771
- x1=None,
772
- weight1=None,
773
- bias1=None,
774
- eps=1e-6,
775
- dropout_p=0.0,
776
- rowscale=None,
777
- prenorm=False,
778
- residual_in_fp32=False,
779
- is_rms_norm=False,
780
- return_dropout_mask=False,
781
- ):
782
- x_shape_og = x.shape
783
- # reshape input data into 2D tensor
784
- x = x.reshape(-1, x.shape[-1])
785
- if x.stride(-1) != 1:
786
- x = x.contiguous()
787
- if residual is not None:
788
- assert residual.shape == x_shape_og
789
- residual = residual.reshape(-1, residual.shape[-1])
790
- if residual.stride(-1) != 1:
791
- residual = residual.contiguous()
792
- if x1 is not None:
793
- assert x1.shape == x_shape_og
794
- assert rowscale is None, "rowscale is not supported with parallel LayerNorm"
795
- x1 = x1.reshape(-1, x1.shape[-1])
796
- if x1.stride(-1) != 1:
797
- x1 = x1.contiguous()
798
- weight = weight.contiguous()
799
- if bias is not None:
800
- bias = bias.contiguous()
801
- if weight1 is not None:
802
- weight1 = weight1.contiguous()
803
- if bias1 is not None:
804
- bias1 = bias1.contiguous()
805
- if rowscale is not None:
806
- rowscale = rowscale.reshape(-1).contiguous()
807
- residual_dtype = (
808
- residual.dtype
809
- if residual is not None
810
- else (torch.float32 if residual_in_fp32 else None)
811
- )
812
- y, y1, mean, rstd, residual_out, seeds, dropout_mask, dropout_mask1 = (
813
- _layer_norm_fwd(
814
- x,
815
- weight,
816
- bias,
817
- eps,
818
- residual,
819
- x1,
820
- weight1,
821
- bias1,
822
- dropout_p=dropout_p,
823
- rowscale=rowscale,
824
- residual_dtype=residual_dtype,
825
- is_rms_norm=is_rms_norm,
826
- return_dropout_mask=return_dropout_mask,
827
- )
828
- )
829
- ctx.save_for_backward(
830
- residual_out, weight, bias, weight1, bias1, rowscale, seeds, mean, rstd
831
- )
832
- ctx.x_shape_og = x_shape_og
833
- ctx.eps = eps
834
- ctx.dropout_p = dropout_p
835
- ctx.is_rms_norm = is_rms_norm
836
- ctx.has_residual = residual is not None
837
- ctx.has_x1 = x1 is not None
838
- ctx.prenorm = prenorm
839
- ctx.x_dtype = x.dtype
840
- y = y.reshape(x_shape_og)
841
- y1 = y1.reshape(x_shape_og) if y1 is not None else None
842
- residual_out = (
843
- residual_out.reshape(x_shape_og) if residual_out is not None else None
844
- )
845
- dropout_mask = (
846
- dropout_mask.reshape(x_shape_og) if dropout_mask is not None else None
847
- )
848
- dropout_mask1 = (
849
- dropout_mask1.reshape(x_shape_og) if dropout_mask1 is not None else None
850
- )
851
- if not return_dropout_mask:
852
- if weight1 is None:
853
- return y if not prenorm else (y, residual_out)
854
- else:
855
- return (y, y1) if not prenorm else (y, y1, residual_out)
856
- else:
857
- if weight1 is None:
858
- return (
859
- (y, dropout_mask, dropout_mask1)
860
- if not prenorm
861
- else (y, residual_out, dropout_mask, dropout_mask1)
862
- )
863
- else:
864
- return (
865
- (y, y1, dropout_mask, dropout_mask1)
866
- if not prenorm
867
- else (y, y1, residual_out, dropout_mask, dropout_mask1)
868
- )
869
-
870
- @staticmethod
871
- def backward(ctx, dy, *args):
872
- x, weight, bias, weight1, bias1, rowscale, seeds, mean, rstd = ctx.saved_tensors
873
- dy = dy.reshape(-1, dy.shape[-1])
874
- if dy.stride(-1) != 1:
875
- dy = dy.contiguous()
876
- assert dy.shape == x.shape
877
- if weight1 is not None:
878
- dy1, args = args[0], args[1:]
879
- dy1 = dy1.reshape(-1, dy1.shape[-1])
880
- if dy1.stride(-1) != 1:
881
- dy1 = dy1.contiguous()
882
- assert dy1.shape == x.shape
883
- else:
884
- dy1 = None
885
- if ctx.prenorm:
886
- dresidual = args[0]
887
- dresidual = dresidual.reshape(-1, dresidual.shape[-1])
888
- if dresidual.stride(-1) != 1:
889
- dresidual = dresidual.contiguous()
890
- assert dresidual.shape == x.shape
891
- else:
892
- dresidual = None
893
- dx, dw, db, dresidual_in, dx1, dw1, db1 = _layer_norm_bwd(
894
- dy,
895
- x,
896
- weight,
897
- bias,
898
- ctx.eps,
899
- mean,
900
- rstd,
901
- dresidual,
902
- dy1,
903
- weight1,
904
- bias1,
905
- seeds,
906
- ctx.dropout_p,
907
- rowscale,
908
- ctx.has_residual,
909
- ctx.has_x1,
910
- ctx.is_rms_norm,
911
- x_dtype=ctx.x_dtype,
912
- )
913
- return (
914
- dx.reshape(ctx.x_shape_og),
915
- dw,
916
- db,
917
- dresidual_in.reshape(ctx.x_shape_og) if ctx.has_residual else None,
918
- dx1.reshape(ctx.x_shape_og) if dx1 is not None else None,
919
- dw1,
920
- db1,
921
- None,
922
- None,
923
- None,
924
- None,
925
- None,
926
- None,
927
- None,
928
- )
929
-
930
-
931
- def layer_norm_fn(
932
- x,
933
- weight,
934
- bias,
935
- residual=None,
936
- x1=None,
937
- weight1=None,
938
- bias1=None,
939
- eps=1e-6,
940
- dropout_p=0.0,
941
- rowscale=None,
942
- prenorm=False,
943
- residual_in_fp32=False,
944
- is_rms_norm=False,
945
- return_dropout_mask=False,
946
- ):
947
- return LayerNormFn.apply(
948
- x,
949
- weight,
950
- bias,
951
- residual,
952
- x1,
953
- weight1,
954
- bias1,
955
- eps,
956
- dropout_p,
957
- rowscale,
958
- prenorm,
959
- residual_in_fp32,
960
- is_rms_norm,
961
- return_dropout_mask,
962
- )
963
-
964
-
965
- def rms_norm_fn(
966
- x,
967
- weight,
968
- bias,
969
- residual=None,
970
- x1=None,
971
- weight1=None,
972
- bias1=None,
973
- eps=1e-6,
974
- dropout_p=0.0,
975
- rowscale=None,
976
- prenorm=False,
977
- residual_in_fp32=False,
978
- return_dropout_mask=False,
979
- ):
980
- return LayerNormFn.apply(
981
- x,
982
- weight,
983
- bias,
984
- residual,
985
- x1,
986
- weight1,
987
- bias1,
988
- eps,
989
- dropout_p,
990
- rowscale,
991
- prenorm,
992
- residual_in_fp32,
993
- True,
994
- return_dropout_mask,
995
- )
996
-
997
-
998
- class RMSNorm(torch.nn.Module):
999
-
1000
- def __init__(self, hidden_size, eps=1e-5, dropout_p=0.0, device=None, dtype=None):
1001
- factory_kwargs = {"device": device, "dtype": dtype}
1002
- super().__init__()
1003
- self.eps = eps
1004
- if dropout_p > 0.0:
1005
- self.drop = torch.nn.Dropout(dropout_p)
1006
- else:
1007
- self.drop = None
1008
- self.weight = torch.nn.Parameter(torch.empty(hidden_size, **factory_kwargs))
1009
- self.register_parameter("bias", None)
1010
- self.reset_parameters()
1011
-
1012
- def reset_parameters(self):
1013
- torch.nn.init.ones_(self.weight)
1014
-
1015
- def forward(self, x, residual=None, prenorm=False, residual_in_fp32=False):
1016
- return rms_norm_fn(
1017
- x,
1018
- self.weight,
1019
- self.bias,
1020
- residual=residual,
1021
- eps=self.eps,
1022
- dropout_p=self.drop.p if self.drop is not None and self.training else 0.0,
1023
- prenorm=prenorm,
1024
- residual_in_fp32=residual_in_fp32,
1025
- )
1026
-
1027
-
1028
- class LayerNormLinearFn(torch.autograd.Function):
1029
- @staticmethod
1030
- @custom_fwd
1031
- def forward(
1032
- ctx,
1033
- x,
1034
- norm_weight,
1035
- norm_bias,
1036
- linear_weight,
1037
- linear_bias,
1038
- residual=None,
1039
- eps=1e-6,
1040
- prenorm=False,
1041
- residual_in_fp32=False,
1042
- is_rms_norm=False,
1043
- ):
1044
- x_shape_og = x.shape
1045
- # reshape input data into 2D tensor
1046
- x = x.reshape(-1, x.shape[-1])
1047
- if x.stride(-1) != 1:
1048
- x = x.contiguous()
1049
- if residual is not None:
1050
- assert residual.shape == x_shape_og
1051
- residual = residual.reshape(-1, residual.shape[-1])
1052
- if residual.stride(-1) != 1:
1053
- residual = residual.contiguous()
1054
- norm_weight = norm_weight.contiguous()
1055
- if norm_bias is not None:
1056
- norm_bias = norm_bias.contiguous()
1057
- residual_dtype = (
1058
- residual.dtype
1059
- if residual is not None
1060
- else (torch.float32 if residual_in_fp32 else None)
1061
- )
1062
- y, _, mean, rstd, residual_out, *rest = _layer_norm_fwd(
1063
- x,
1064
- norm_weight,
1065
- norm_bias,
1066
- eps,
1067
- residual,
1068
- out_dtype=(
1069
- None
1070
- if not torch.is_autocast_enabled()
1071
- else torch.get_autocast_gpu_dtype()
1072
- ),
1073
- residual_dtype=residual_dtype,
1074
- is_rms_norm=is_rms_norm,
1075
- )
1076
- y = y.reshape(x_shape_og)
1077
- dtype = (
1078
- torch.get_autocast_gpu_dtype() if torch.is_autocast_enabled() else y.dtype
1079
- )
1080
- linear_weight = linear_weight.to(dtype)
1081
- linear_bias = linear_bias.to(dtype) if linear_bias is not None else None
1082
- out = F.linear(y.to(linear_weight.dtype), linear_weight, linear_bias)
1083
- # We don't store y, will be recomputed in the backward pass to save memory
1084
- ctx.save_for_backward(
1085
- residual_out, norm_weight, norm_bias, linear_weight, mean, rstd
1086
- )
1087
- ctx.x_shape_og = x_shape_og
1088
- ctx.eps = eps
1089
- ctx.is_rms_norm = is_rms_norm
1090
- ctx.has_residual = residual is not None
1091
- ctx.prenorm = prenorm
1092
- ctx.x_dtype = x.dtype
1093
- ctx.linear_bias_is_none = linear_bias is None
1094
- return out if not prenorm else (out, residual_out.reshape(x_shape_og))
1095
-
1096
- @staticmethod
1097
- @custom_bwd
1098
- def backward(ctx, dout, *args):
1099
- x, norm_weight, norm_bias, linear_weight, mean, rstd = ctx.saved_tensors
1100
- dout = dout.reshape(-1, dout.shape[-1])
1101
- dy = F.linear(dout, linear_weight.t())
1102
- dlinear_bias = None if ctx.linear_bias_is_none else dout.sum(0)
1103
- if dy.stride(-1) != 1:
1104
- dy = dy.contiguous()
1105
- assert dy.shape == x.shape
1106
- if ctx.prenorm:
1107
- dresidual = args[0]
1108
- dresidual = dresidual.reshape(-1, dresidual.shape[-1])
1109
- if dresidual.stride(-1) != 1:
1110
- dresidual = dresidual.contiguous()
1111
- assert dresidual.shape == x.shape
1112
- else:
1113
- dresidual = None
1114
- dx, dnorm_weight, dnorm_bias, dresidual_in, _, _, _, y = _layer_norm_bwd(
1115
- dy,
1116
- x,
1117
- norm_weight,
1118
- norm_bias,
1119
- ctx.eps,
1120
- mean,
1121
- rstd,
1122
- dresidual=dresidual,
1123
- has_residual=ctx.has_residual,
1124
- is_rms_norm=ctx.is_rms_norm,
1125
- x_dtype=ctx.x_dtype,
1126
- recompute_output=True,
1127
- )
1128
- dlinear_weight = torch.einsum("bo,bi->oi", dout, y)
1129
- return (
1130
- dx.reshape(ctx.x_shape_og),
1131
- dnorm_weight,
1132
- dnorm_bias,
1133
- dlinear_weight,
1134
- dlinear_bias,
1135
- dresidual_in.reshape(ctx.x_shape_og) if ctx.has_residual else None,
1136
- None,
1137
- None,
1138
- None,
1139
- None,
1140
- )
1141
-
1142
-
1143
- def layer_norm_linear_fn(
1144
- x,
1145
- norm_weight,
1146
- norm_bias,
1147
- linear_weight,
1148
- linear_bias,
1149
- residual=None,
1150
- eps=1e-6,
1151
- prenorm=False,
1152
- residual_in_fp32=False,
1153
- is_rms_norm=False,
1154
- ):
1155
- return LayerNormLinearFn.apply(
1156
- x,
1157
- norm_weight,
1158
- norm_bias,
1159
- linear_weight,
1160
- linear_bias,
1161
- residual,
1162
- eps,
1163
- prenorm,
1164
- residual_in_fp32,
1165
- is_rms_norm,
1166
- )
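
Usage note for the fused layer-norm kernels removed above: the public entry points of this file were layer_norm_fn, rms_norm_fn, and the RMSNorm module, with layer_norm_ref / rms_norm_ref as the pure-PyTorch references. A minimal call sketch follows, assuming the pre-removal import path (mamba_ssm.ops.triton.layer_norm), a CUDA device, and Triton installed; this is illustrative only and not part of the diff:

    import torch
    from mamba_ssm.ops.triton.layer_norm import RMSNorm  # import path as laid out in this build tree

    hidden = 1024
    x = torch.randn(4, 128, hidden, device="cuda", dtype=torch.float16)
    residual = torch.randn_like(x)
    norm = RMSNorm(hidden, eps=1e-5, device="cuda", dtype=torch.float16)

    # prenorm=True returns (normalized_output, updated_residual); residual_in_fp32 keeps the
    # running residual in float32, matching the pre-norm residual pattern used by the models above
    y, new_residual = norm(x, residual=residual, prenorm=True, residual_in_fp32=True)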
 
build/torch25-cxx11-cu124-x86_64-linux/mamba_ssm/ops/triton/selective_state_update.py DELETED
@@ -1,389 +0,0 @@
1
- # Copyright (c) 2024, Tri Dao, Albert Gu.
2
-
3
- """We want triton==2.1.0 or triton==2.2.0 or triton==2.3.0 for this
4
- """
5
-
6
- import math
7
- import torch
8
- import torch.nn.functional as F
9
-
10
- import triton
11
- import triton.language as tl
12
-
13
- from einops import rearrange, repeat
14
-
15
- from .softplus import softplus
16
-
17
-
18
- @triton.heuristics({"HAS_DT_BIAS": lambda args: args["dt_bias_ptr"] is not None})
19
- @triton.heuristics({"HAS_D": lambda args: args["D_ptr"] is not None})
20
- @triton.heuristics({"HAS_Z": lambda args: args["z_ptr"] is not None})
21
- @triton.heuristics(
22
- {
23
- "HAS_STATE_BATCH_INDICES": lambda args: args["state_batch_indices_ptr"]
24
- is not None
25
- }
26
- )
27
- @triton.heuristics(
28
- {"BLOCK_SIZE_DSTATE": lambda args: triton.next_power_of_2(args["dstate"])}
29
- )
30
- @triton.jit
31
- def _selective_scan_update_kernel(
32
- # Pointers to matrices
33
- state_ptr,
34
- x_ptr,
35
- dt_ptr,
36
- dt_bias_ptr,
37
- A_ptr,
38
- B_ptr,
39
- C_ptr,
40
- D_ptr,
41
- z_ptr,
42
- out_ptr,
43
- state_batch_indices_ptr,
44
- # Matrix dimensions
45
- batch,
46
- nheads,
47
- dim,
48
- dstate,
49
- nheads_ngroups_ratio,
50
- # Strides
51
- stride_state_batch,
52
- stride_state_head,
53
- stride_state_dim,
54
- stride_state_dstate,
55
- stride_x_batch,
56
- stride_x_head,
57
- stride_x_dim,
58
- stride_dt_batch,
59
- stride_dt_head,
60
- stride_dt_dim,
61
- stride_dt_bias_head,
62
- stride_dt_bias_dim,
63
- stride_A_head,
64
- stride_A_dim,
65
- stride_A_dstate,
66
- stride_B_batch,
67
- stride_B_group,
68
- stride_B_dstate,
69
- stride_C_batch,
70
- stride_C_group,
71
- stride_C_dstate,
72
- stride_D_head,
73
- stride_D_dim,
74
- stride_z_batch,
75
- stride_z_head,
76
- stride_z_dim,
77
- stride_out_batch,
78
- stride_out_head,
79
- stride_out_dim,
80
- # Meta-parameters
81
- DT_SOFTPLUS: tl.constexpr,
82
- TIE_HDIM: tl.constexpr,
83
- BLOCK_SIZE_M: tl.constexpr,
84
- HAS_DT_BIAS: tl.constexpr,
85
- HAS_D: tl.constexpr,
86
- HAS_Z: tl.constexpr,
87
- HAS_STATE_BATCH_INDICES: tl.constexpr,
88
- BLOCK_SIZE_DSTATE: tl.constexpr,
89
- ):
90
- pid_m = tl.program_id(axis=0)
91
- pid_b = tl.program_id(axis=1)
92
- pid_h = tl.program_id(axis=2)
93
-
94
- if HAS_STATE_BATCH_INDICES:
95
- state_batch_indices_ptr += pid_b
96
- state_batch_idx = tl.load(state_batch_indices_ptr)
97
- state_ptr += state_batch_idx * stride_state_batch + pid_h * stride_state_head
98
- else:
99
- state_ptr += pid_b * stride_state_batch + pid_h * stride_state_head
100
-
101
- x_ptr += pid_b * stride_x_batch + pid_h * stride_x_head
102
- dt_ptr += pid_b * stride_dt_batch + pid_h * stride_dt_head
103
- if HAS_DT_BIAS:
104
- dt_bias_ptr += pid_h * stride_dt_bias_head
105
- A_ptr += pid_h * stride_A_head
106
- B_ptr += pid_b * stride_B_batch + (pid_h // nheads_ngroups_ratio) * stride_B_group
107
- C_ptr += pid_b * stride_C_batch + (pid_h // nheads_ngroups_ratio) * stride_C_group
108
- if HAS_Z:
109
- z_ptr += pid_b * stride_z_batch + pid_h * stride_z_head
110
- out_ptr += pid_b * stride_out_batch + pid_h * stride_out_head
111
-
112
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
113
- offs_n = tl.arange(0, BLOCK_SIZE_DSTATE)
114
- state_ptrs = state_ptr + (
115
- offs_m[:, None] * stride_state_dim + offs_n[None, :] * stride_state_dstate
116
- )
117
- x_ptrs = x_ptr + offs_m * stride_x_dim
118
- dt_ptrs = dt_ptr + offs_m * stride_dt_dim
119
- if HAS_DT_BIAS:
120
- dt_bias_ptrs = dt_bias_ptr + offs_m * stride_dt_bias_dim
121
- if HAS_D:
122
- D_ptr += pid_h * stride_D_head
123
- A_ptrs = A_ptr + (
124
- offs_m[:, None] * stride_A_dim + offs_n[None, :] * stride_A_dstate
125
- )
126
- B_ptrs = B_ptr + offs_n * stride_B_dstate
127
- C_ptrs = C_ptr + offs_n * stride_C_dstate
128
- if HAS_D:
129
- D_ptrs = D_ptr + offs_m * stride_D_dim
130
- if HAS_Z:
131
- z_ptrs = z_ptr + offs_m * stride_z_dim
132
- out_ptrs = out_ptr + offs_m * stride_out_dim
133
-
134
- state = tl.load(
135
- state_ptrs, mask=(offs_m[:, None] < dim) & (offs_n[None, :] < dstate), other=0.0
136
- )
137
- x = tl.load(x_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
138
- if not TIE_HDIM:
139
- dt = tl.load(dt_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
140
- if HAS_DT_BIAS:
141
- dt += tl.load(dt_bias_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
142
- if DT_SOFTPLUS:
143
- dt = tl.where(dt <= 20.0, softplus(dt), dt)
144
- A = tl.load(
145
- A_ptrs, mask=(offs_m[:, None] < dim) & (offs_n[None, :] < dstate), other=0.0
146
- ).to(tl.float32)
147
- dA = tl.exp(A * dt[:, None])
148
- else:
149
- dt = tl.load(dt_ptr).to(tl.float32)
150
- if HAS_DT_BIAS:
151
- dt += tl.load(dt_bias_ptr).to(tl.float32)
152
- if DT_SOFTPLUS:
153
- dt = tl.where(dt <= 20.0, softplus(dt), dt)
154
- A = tl.load(A_ptr).to(tl.float32)
155
- dA = tl.exp(A * dt) # scalar, not a matrix
156
-
157
- B = tl.load(B_ptrs, mask=offs_n < dstate, other=0.0).to(tl.float32)
158
- C = tl.load(C_ptrs, mask=offs_n < dstate, other=0.0).to(tl.float32)
159
- if HAS_D:
160
- D = tl.load(D_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
161
- if HAS_Z:
162
- z = tl.load(z_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
163
-
164
- if not TIE_HDIM:
165
- dB = B[None, :] * dt[:, None]
166
- else:
167
- dB = B * dt # vector of size (dstate,)
168
- state = state * dA + dB * x[:, None]
169
- tl.store(
170
- state_ptrs, state, mask=(offs_m[:, None] < dim) & (offs_n[None, :] < dstate)
171
- )
172
- out = tl.sum(state * C[None, :], axis=1)
173
- if HAS_D:
174
- out += x * D
175
- if HAS_Z:
176
- out *= z * tl.sigmoid(z)
177
- tl.store(out_ptrs, out, mask=offs_m < dim)
178
-
179
-
180
- def selective_state_update(
181
- state,
182
- x,
183
- dt,
184
- A,
185
- B,
186
- C,
187
- D=None,
188
- z=None,
189
- dt_bias=None,
190
- dt_softplus=False,
191
- state_batch_indices=None,
192
- ):
193
- """
194
- Argument:
195
- state: (batch, dim, dstate) or (batch, nheads, dim, dstate)
196
- x: (batch, dim) or (batch, nheads, dim)
197
- dt: (batch, dim) or (batch, nheads, dim)
198
- A: (dim, dstate) or (nheads, dim, dstate)
199
- B: (batch, dstate) or (batch, ngroups, dstate)
200
- C: (batch, dstate) or (batch, ngroups, dstate)
201
- D: (dim,) or (nheads, dim)
202
- z: (batch, dim) or (batch, nheads, dim)
203
- dt_bias: (dim,) or (nheads, dim)
204
- Return:
205
- out: (batch, dim) or (batch, nheads, dim)
206
- """
207
- has_heads = state.dim() > 3
208
- if state.dim() == 3:
209
- state = state.unsqueeze(1)
210
- if x.dim() == 2:
211
- x = x.unsqueeze(1)
212
- if dt.dim() == 2:
213
- dt = dt.unsqueeze(1)
214
- if A.dim() == 2:
215
- A = A.unsqueeze(0)
216
- if B.dim() == 2:
217
- B = B.unsqueeze(1)
218
- if C.dim() == 2:
219
- C = C.unsqueeze(1)
220
- if D is not None and D.dim() == 1:
221
- D = D.unsqueeze(0)
222
- if z is not None and z.dim() == 2:
223
- z = z.unsqueeze(1)
224
- if dt_bias is not None and dt_bias.dim() == 1:
225
- dt_bias = dt_bias.unsqueeze(0)
226
- _, nheads, dim, dstate = state.shape
227
- batch = x.shape[0]
228
- if x.shape != (batch, nheads, dim):
229
- print(f"{state.shape} {x.shape} {batch} {nheads} {dim}")
230
- assert x.shape == (batch, nheads, dim)
231
- assert dt.shape == x.shape
232
- assert A.shape == (nheads, dim, dstate)
233
- ngroups = B.shape[1]
234
- assert nheads % ngroups == 0, "nheads must be divisible by ngroups"
235
- assert B.shape == (batch, ngroups, dstate)
236
- assert C.shape == B.shape
237
- if D is not None:
238
- assert D.shape == (nheads, dim)
239
- if z is not None:
240
- assert z.shape == x.shape
241
- if dt_bias is not None:
242
- assert dt_bias.shape == (nheads, dim)
243
- if state_batch_indices is not None:
244
- assert state_batch_indices.shape == (batch,)
245
- out = torch.empty_like(x)
246
- grid = lambda META: (triton.cdiv(dim, META["BLOCK_SIZE_M"]), batch, nheads)
247
- z_strides = (z.stride(0), z.stride(1), z.stride(2)) if z is not None else (0, 0, 0)
248
- # We don't want autotune since it will overwrite the state
249
- # We instead tune by hand.
250
- BLOCK_SIZE_M, num_warps = (
251
- (32, 4)
252
- if dstate <= 16
253
- else (
254
- (16, 4)
255
- if dstate <= 32
256
- else ((8, 4) if dstate <= 64 else ((4, 4) if dstate <= 128 else ((4, 8))))
257
- )
258
- )
259
- tie_hdim = (
260
- A.stride(-1) == 0
261
- and A.stride(-2) == 0
262
- and dt.stride(-1) == 0
263
- and dt_bias.stride(-1) == 0
264
- )
265
- with torch.cuda.device(x.device.index):
266
- _selective_scan_update_kernel[grid](
267
- state,
268
- x,
269
- dt,
270
- dt_bias,
271
- A,
272
- B,
273
- C,
274
- D,
275
- z,
276
- out,
277
- state_batch_indices,
278
- batch,
279
- nheads,
280
- dim,
281
- dstate,
282
- nheads // ngroups,
283
- state.stride(0),
284
- state.stride(1),
285
- state.stride(2),
286
- state.stride(3),
287
- x.stride(0),
288
- x.stride(1),
289
- x.stride(2),
290
- dt.stride(0),
291
- dt.stride(1),
292
- dt.stride(2),
293
- *(dt_bias.stride(0), dt_bias.stride(1)) if dt_bias is not None else 0,
294
- A.stride(0),
295
- A.stride(1),
296
- A.stride(2),
297
- B.stride(0),
298
- B.stride(1),
299
- B.stride(2),
300
- C.stride(0),
301
- C.stride(1),
302
- C.stride(2),
303
- *(D.stride(0), D.stride(1)) if D is not None else 0,
304
- z_strides[0],
305
- z_strides[1],
306
- z_strides[2],
307
- out.stride(0),
308
- out.stride(1),
309
- out.stride(2),
310
- dt_softplus,
311
- tie_hdim,
312
- BLOCK_SIZE_M,
313
- num_warps=num_warps,
314
- )
315
- if not has_heads:
316
- out = out.squeeze(1)
317
- return out
318
-
319
-
320
- def selective_state_update_ref(
321
- state, x, dt, A, B, C, D=None, z=None, dt_bias=None, dt_softplus=False
322
- ):
323
- """
324
- Argument:
325
- state: (batch, dim, dstate) or (batch, nheads, dim, dstate)
326
- x: (batch, dim) or (batch, nheads, dim)
327
- dt: (batch, dim) or (batch, nheads, dim)
328
- A: (dim, dstate) or (nheads, dim, dstate)
329
- B: (batch, dstate) or (batch, ngroups, dstate)
330
- C: (batch, dstate) or (batch, ngroups, dstate)
331
- D: (dim,) or (nheads, dim)
332
- z: (batch, dim) or (batch, nheads, dim)
333
- dt_bias: (dim,) or (nheads, dim)
334
- Return:
335
- out: (batch, dim) or (batch, nheads, dim)
336
- """
337
- has_heads = state.dim() > 3
338
- if state.dim() == 3:
339
- state = state.unsqueeze(1)
340
- if x.dim() == 2:
341
- x = x.unsqueeze(1)
342
- if dt.dim() == 2:
343
- dt = dt.unsqueeze(1)
344
- if A.dim() == 2:
345
- A = A.unsqueeze(0)
346
- if B.dim() == 2:
347
- B = B.unsqueeze(1)
348
- if C.dim() == 2:
349
- C = C.unsqueeze(1)
350
- if D is not None and D.dim() == 1:
351
- D = D.unsqueeze(0)
352
- if z is not None and z.dim() == 2:
353
- z = z.unsqueeze(1)
354
- if dt_bias is not None and dt_bias.dim() == 1:
355
- dt_bias = dt_bias.unsqueeze(0)
356
- batch, nheads, dim, dstate = state.shape
357
- assert x.shape == (batch, nheads, dim)
358
- assert dt.shape == x.shape
359
- assert A.shape == (nheads, dim, dstate)
360
- ngroups = B.shape[1]
361
- assert nheads % ngroups == 0, "nheads must be divisible by ngroups"
362
- assert B.shape == (batch, ngroups, dstate)
363
- assert C.shape == B.shape
364
- if D is not None:
365
- assert D.shape == (nheads, dim)
366
- if z is not None:
367
- assert z.shape == x.shape
368
- if dt_bias is not None:
369
- assert dt_bias.shape == (nheads, dim)
370
- dt = dt + dt_bias
371
- dt = F.softplus(dt) if dt_softplus else dt
372
- dA = torch.exp(
373
- rearrange(dt, "b h d -> b h d 1") * A
374
- ) # (batch, nheads, dim, dstate)
375
- B = repeat(B, "b g n -> b (g h) n", h=nheads // ngroups) # (batch, nheads, dstate)
376
- C = repeat(C, "b g n -> b (g h) n", h=nheads // ngroups) # (batch, nheads, dstate)
377
- dB = rearrange(dt, "b h d -> b h d 1") * rearrange(
378
- B, "b h n -> b h 1 n"
379
- ) # (batch, nheads, dim, dstate)
380
- state.copy_(
381
- state * dA + dB * rearrange(x, "b h d -> b h d 1")
382
- ) # (batch, dim, dstate
383
- out = torch.einsum("bhdn,bhn->bhd", state.to(C.dtype), C)
384
- if D is not None:
385
- out += (x * D).to(out.dtype)
386
- out = (out if z is None else out * F.silu(z)).to(x.dtype)
387
- if not has_heads:
388
- out = out.squeeze(1)
389
- return out
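
For context on the kernel removed above: selective_state_update performs a single decoding step of the discretized state-space recurrence, and selective_state_update_ref spells out the same math in plain PyTorch. A short standalone sketch of that step (shapes follow the docstring; ngroups is taken equal to nheads for brevity, and all tensor values are dummy data):

    import torch
    import torch.nn.functional as F

    batch, nheads, dim, dstate = 2, 4, 16, 8
    state = torch.zeros(batch, nheads, dim, dstate)
    x = torch.randn(batch, nheads, dim)
    dt = F.softplus(torch.randn(batch, nheads, dim))   # positive step sizes
    A = -torch.rand(nheads, dim, dstate)               # negative values keep the recurrence stable
    B = torch.randn(batch, nheads, dstate)
    C = torch.randn(batch, nheads, dstate)

    dA = torch.exp(dt.unsqueeze(-1) * A)               # (batch, nheads, dim, dstate)
    dB = dt.unsqueeze(-1) * B.unsqueeze(2)             # (batch, nheads, dim, dstate)
    state = state * dA + dB * x.unsqueeze(-1)          # discretized update: h <- dA * h + dB * x
    out = torch.einsum("bhdn,bhn->bhd", state, C)      # per-head readout y = C . h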
 
build/torch25-cxx11-cu124-x86_64-linux/mamba_ssm/ops/triton/ssd_chunk_scan.py DELETED
The diff for this file is too large to render. See raw diff
 
build/torch25-cxx11-cu124-x86_64-linux/mamba_ssm/ops/triton/ssd_chunk_state.py DELETED
@@ -1,2012 +0,0 @@
1
- # Copyright (c) 2024, Tri Dao, Albert Gu.
2
-
3
- """We want triton==2.1.0 or 2.2.0 for this
4
- """
5
-
6
- import math
7
- import torch
8
- import torch.nn.functional as F
9
-
10
- import triton
11
- import triton.language as tl
12
-
13
- from einops import rearrange, repeat
14
-
15
- from .softplus import softplus
16
-
17
-
18
- def init_to_zero(names):
19
- return lambda nargs: [
20
- nargs[name].zero_() for name in names if nargs[name] is not None
21
- ]
22
-
23
-
24
- @triton.autotune(
25
- configs=[
26
- triton.Config({"BLOCK_SIZE_H": 1}),
27
- triton.Config({"BLOCK_SIZE_H": 2}),
28
- triton.Config({"BLOCK_SIZE_H": 4}),
29
- triton.Config({"BLOCK_SIZE_H": 8}),
30
- triton.Config({"BLOCK_SIZE_H": 16}),
31
- triton.Config({"BLOCK_SIZE_H": 32}),
32
- triton.Config({"BLOCK_SIZE_H": 64}),
33
- ],
34
- key=["chunk_size", "nheads"],
35
- )
36
- @triton.jit
37
- def _chunk_cumsum_fwd_kernel(
38
- # Pointers to matrices
39
- dt_ptr,
40
- A_ptr,
41
- dt_bias_ptr,
42
- dt_out_ptr,
43
- dA_cumsum_ptr,
44
- # Matrix dimension
45
- batch,
46
- seqlen,
47
- nheads,
48
- chunk_size,
49
- dt_min,
50
- dt_max,
51
- # Strides
52
- stride_dt_batch,
53
- stride_dt_seqlen,
54
- stride_dt_head,
55
- stride_A_head,
56
- stride_dt_bias_head,
57
- stride_dt_out_batch,
58
- stride_dt_out_chunk,
59
- stride_dt_out_head,
60
- stride_dt_out_csize,
61
- stride_dA_cs_batch,
62
- stride_dA_cs_chunk,
63
- stride_dA_cs_head,
64
- stride_dA_cs_csize,
65
- # Meta-parameters
66
- DT_SOFTPLUS: tl.constexpr,
67
- HAS_DT_BIAS: tl.constexpr,
68
- BLOCK_SIZE_H: tl.constexpr,
69
- BLOCK_SIZE_CHUNK: tl.constexpr,
70
- ):
71
- pid_b = tl.program_id(axis=0)
72
- pid_c = tl.program_id(axis=1)
73
- pid_h = tl.program_id(axis=2)
74
- dt_ptr += pid_b * stride_dt_batch + pid_c * chunk_size * stride_dt_seqlen
75
- dt_out_ptr += pid_b * stride_dt_out_batch + pid_c * stride_dt_out_chunk
76
- dA_cumsum_ptr += pid_b * stride_dA_cs_batch + pid_c * stride_dA_cs_chunk
77
-
78
- offs_h = pid_h * BLOCK_SIZE_H + tl.arange(0, BLOCK_SIZE_H)
79
- offs_c = tl.arange(0, BLOCK_SIZE_CHUNK)
80
- dt_ptrs = dt_ptr + (
81
- offs_h[:, None] * stride_dt_head + offs_c[None, :] * stride_dt_seqlen
82
- )
83
- A_ptrs = A_ptr + offs_h * stride_A_head
84
- dt_out_ptrs = dt_out_ptr + (
85
- offs_h[:, None] * stride_dt_out_head + offs_c[None, :] * stride_dt_out_csize
86
- )
87
- dA_cs_ptrs = dA_cumsum_ptr + (
88
- offs_h[:, None] * stride_dA_cs_head + offs_c[None, :] * stride_dA_cs_csize
89
- )
90
- chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
91
-
92
- dt = tl.load(
93
- dt_ptrs,
94
- mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit),
95
- other=0.0,
96
- ).to(tl.float32)
97
- if HAS_DT_BIAS:
98
- dt_bias = tl.load(
99
- dt_bias_ptr + offs_h * stride_dt_bias_head, mask=offs_h < nheads, other=0.0
100
- ).to(tl.float32)
101
- dt += dt_bias[:, None]
102
- if DT_SOFTPLUS:
103
- dt = tl.where(dt <= 20.0, softplus(dt), dt)
104
- # As of Triton 2.2.0, tl.clamp is not available yet
105
- # dt = tl.clamp(dt, dt_min, dt_max)
106
- dt = tl.minimum(tl.maximum(dt, dt_min), dt_max)
107
- dt = tl.where(
108
- (offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit), dt, 0.0
109
- )
110
- tl.store(
111
- dt_out_ptrs,
112
- dt,
113
- mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size),
114
- )
115
- A = tl.load(A_ptrs, mask=offs_h < nheads, other=0.0).to(tl.float32)
116
- dA = dt * A[:, None]
117
- dA_cs = tl.cumsum(dA, axis=1)
118
- tl.store(
119
- dA_cs_ptrs,
120
- dA_cs,
121
- mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size),
122
- )
123
-
124
-
125
- @triton.autotune(
126
- configs=[
127
- triton.Config(
128
- {"BLOCK_SIZE_H": 1}, pre_hook=init_to_zero(["dA_ptr", "ddt_bias_ptr"])
129
- ),
130
- triton.Config(
131
- {"BLOCK_SIZE_H": 2}, pre_hook=init_to_zero(["dA_ptr", "ddt_bias_ptr"])
132
- ),
133
- triton.Config(
134
- {"BLOCK_SIZE_H": 4}, pre_hook=init_to_zero(["dA_ptr", "ddt_bias_ptr"])
135
- ),
136
- triton.Config(
137
- {"BLOCK_SIZE_H": 8}, pre_hook=init_to_zero(["dA_ptr", "ddt_bias_ptr"])
138
- ),
139
- triton.Config(
140
- {"BLOCK_SIZE_H": 16}, pre_hook=init_to_zero(["dA_ptr", "ddt_bias_ptr"])
141
- ),
142
- triton.Config(
143
- {"BLOCK_SIZE_H": 32}, pre_hook=init_to_zero(["dA_ptr", "ddt_bias_ptr"])
144
- ),
145
- triton.Config(
146
- {"BLOCK_SIZE_H": 64}, pre_hook=init_to_zero(["dA_ptr", "ddt_bias_ptr"])
147
- ),
148
- ],
149
- key=["chunk_size", "nheads"],
150
- )
151
- @triton.jit
152
- def _chunk_cumsum_bwd_kernel(
153
- # Pointers to matrices
154
- ddA_ptr,
155
- ddt_out_ptr,
156
- dt_ptr,
157
- A_ptr,
158
- dt_bias_ptr,
159
- ddt_ptr,
160
- dA_ptr,
161
- ddt_bias_ptr,
162
- # Matrix dimensions
163
- batch,
164
- seqlen,
165
- nheads,
166
- chunk_size,
167
- dt_min,
168
- dt_max,
169
- # Strides
170
- stride_ddA_batch,
171
- stride_ddA_chunk,
172
- stride_ddA_head,
173
- stride_ddA_csize,
174
- stride_ddt_out_batch,
175
- stride_ddt_out_chunk,
176
- stride_ddt_out_head,
177
- stride_ddt_out_csize,
178
- stride_dt_batch,
179
- stride_dt_seqlen,
180
- stride_dt_head,
181
- stride_A_head,
182
- stride_dt_bias_head,
183
- stride_ddt_batch,
184
- stride_ddt_seqlen,
185
- stride_ddt_head,
186
- stride_dA_head,
187
- stride_ddt_bias_head,
188
- # Meta-parameters
189
- DT_SOFTPLUS: tl.constexpr,
190
- HAS_DT_BIAS: tl.constexpr,
191
- BLOCK_SIZE_H: tl.constexpr,
192
- BLOCK_SIZE_CHUNK: tl.constexpr,
193
- ):
194
- pid_b = tl.program_id(axis=0)
195
- pid_c = tl.program_id(axis=1)
196
- pid_h = tl.program_id(axis=2)
197
- ddt_out_ptr += pid_b * stride_ddt_out_batch + pid_c * stride_ddt_out_chunk
198
- ddA_ptr += pid_b * stride_ddA_batch + pid_c * stride_ddA_chunk
199
- dt_ptr += pid_b * stride_dt_batch + pid_c * chunk_size * stride_dt_seqlen
200
- ddt_ptr += pid_b * stride_ddt_batch + pid_c * chunk_size * stride_ddt_seqlen
201
-
202
- offs_h = pid_h * BLOCK_SIZE_H + tl.arange(0, BLOCK_SIZE_H)
203
- offs_c = tl.arange(0, BLOCK_SIZE_CHUNK)
204
- ddt_out_ptrs = ddt_out_ptr + (
205
- offs_h[:, None] * stride_ddt_out_head + offs_c[None, :] * stride_ddt_out_csize
206
- )
207
- ddA_ptrs = ddA_ptr + (
208
- offs_h[:, None] * stride_ddA_head + offs_c[None, :] * stride_ddA_csize
209
- )
210
- dt_ptrs = dt_ptr + (
211
- offs_h[:, None] * stride_dt_head + offs_c[None, :] * stride_dt_seqlen
212
- )
213
- ddt_ptrs = ddt_ptr + (
214
- offs_h[:, None] * stride_ddt_head + offs_c[None, :] * stride_ddt_seqlen
215
- )
216
- A_ptrs = A_ptr + offs_h * stride_A_head
217
- chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
218
-
219
- ddA = tl.load(
220
- ddA_ptrs,
221
- mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit),
222
- other=0.0,
223
- ).to(tl.float32)
224
- ddt_out = tl.load(
225
- ddt_out_ptrs,
226
- mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit),
227
- other=0.0,
228
- ).to(tl.float32)
229
- A = tl.load(A_ptrs, mask=offs_h < nheads, other=0.0).to(tl.float32)
230
- ddt = ddA * A[:, None] + ddt_out
231
- dt = tl.load(
232
- dt_ptrs,
233
- mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit),
234
- other=0.0,
235
- ).to(tl.float32)
236
- if HAS_DT_BIAS:
237
- dt_bias = tl.load(
238
- dt_bias_ptr + offs_h * stride_dt_bias_head, mask=offs_h < nheads, other=0.0
239
- ).to(tl.float32)
240
- dt += dt_bias[:, None]
241
- if DT_SOFTPLUS:
242
- dt_presoftplus = dt
243
- dt = tl.where(dt <= 20.0, softplus(dt), ddt)
244
- clamp_mask = (dt < dt_min) | (dt > dt_max)
245
- # As of Triton 2.2.0, tl.clamp is not available yet
246
- # dt = tl.clamp(dt, dt_min, dt_max)
247
- dt = tl.minimum(tl.maximum(dt, dt_min), dt_max)
248
- dt = tl.where(
249
- (offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit), dt, 0.0
250
- )
251
- ddt = tl.where(
252
- (offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit), ddt, 0.0
253
- )
254
- ddt = tl.where(clamp_mask, 0.0, ddt)
255
- if DT_SOFTPLUS:
256
- ddt = tl.where(dt_presoftplus <= 20.0, ddt * tl.sigmoid(dt_presoftplus), ddt)
257
- tl.store(
258
- ddt_ptrs,
259
- ddt,
260
- mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit),
261
- )
262
- dA = tl.sum(ddA * dt, axis=1)
263
- tl.atomic_add(dA_ptr + offs_h * stride_dA_head, dA, mask=offs_h < nheads)
264
- if HAS_DT_BIAS:
265
- ddt_bias = tl.sum(ddt, axis=1)
266
- tl.atomic_add(
267
- ddt_bias_ptr + offs_h * stride_ddt_bias_head, ddt_bias, mask=offs_h < nheads
268
- )
269
-
270
-
271
- @triton.autotune(
272
- configs=[
273
- triton.Config(
274
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64},
275
- num_stages=3,
276
- num_warps=8,
277
- ),
278
- triton.Config(
279
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32},
280
- num_stages=4,
281
- num_warps=4,
282
- ),
283
- triton.Config(
284
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32},
285
- num_stages=4,
286
- num_warps=4,
287
- ),
288
- triton.Config(
289
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
290
- num_stages=4,
291
- num_warps=4,
292
- ),
293
- triton.Config(
294
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32},
295
- num_stages=4,
296
- num_warps=4,
297
- ),
298
- triton.Config(
299
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32},
300
- num_stages=4,
301
- num_warps=4,
302
- ),
303
- triton.Config(
304
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32},
305
- num_stages=5,
306
- num_warps=2,
307
- ),
308
- triton.Config(
309
- {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
310
- num_stages=5,
311
- num_warps=2,
312
- ),
313
- triton.Config(
314
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
315
- num_stages=4,
316
- num_warps=2,
317
- ),
318
- ],
319
- key=["hdim", "dstate", "chunk_size"],
320
- )
321
- @triton.jit
322
- def _chunk_state_fwd_kernel(
323
- # Pointers to matrices
324
- x_ptr,
325
- b_ptr,
326
- states_ptr,
327
- dt_ptr,
328
- dA_cumsum_ptr,
329
- seq_idx_ptr,
330
- # Matrix dimensions
331
- hdim,
332
- dstate,
333
- chunk_size,
334
- batch,
335
- seqlen,
336
- nheads_ngroups_ratio,
337
- # Strides
338
- stride_x_batch,
339
- stride_x_seqlen,
340
- stride_x_head,
341
- stride_x_hdim,
342
- stride_b_batch,
343
- stride_b_seqlen,
344
- stride_b_head,
345
- stride_b_dstate,
346
- stride_states_batch,
347
- stride_states_chunk,
348
- stride_states_head,
349
- stride_states_hdim,
350
- stride_states_dstate,
351
- stride_dt_batch,
352
- stride_dt_chunk,
353
- stride_dt_head,
354
- stride_dt_csize,
355
- stride_dA_cs_batch,
356
- stride_dA_cs_chunk,
357
- stride_dA_cs_head,
358
- stride_dA_cs_csize,
359
- stride_seq_idx_batch,
360
- stride_seq_idx_seqlen,
361
- # Meta-parameters
362
- HAS_SEQ_IDX: tl.constexpr,
363
- BLOCK_SIZE_M: tl.constexpr,
364
- BLOCK_SIZE_N: tl.constexpr,
365
- BLOCK_SIZE_K: tl.constexpr,
366
- ):
367
- pid_bc = tl.program_id(axis=1)
368
- pid_c = pid_bc // batch
369
- pid_b = pid_bc - pid_c * batch
370
- pid_h = tl.program_id(axis=2)
371
- num_pid_n = tl.cdiv(dstate, BLOCK_SIZE_N)
372
- pid_m = tl.program_id(axis=0) // num_pid_n
373
- pid_n = tl.program_id(axis=0) % num_pid_n
374
- b_ptr += (
375
- pid_b * stride_b_batch
376
- + pid_c * chunk_size * stride_b_seqlen
377
- + (pid_h // nheads_ngroups_ratio) * stride_b_head
378
- )
379
- x_ptr += (
380
- pid_b * stride_x_batch
381
- + pid_c * chunk_size * stride_x_seqlen
382
- + pid_h * stride_x_head
383
- )
384
- dt_ptr += pid_b * stride_dt_batch + pid_c * stride_dt_chunk + pid_h * stride_dt_head
385
- dA_cumsum_ptr += (
386
- pid_b * stride_dA_cs_batch
387
- + pid_c * stride_dA_cs_chunk
388
- + pid_h * stride_dA_cs_head
389
- )
390
- if HAS_SEQ_IDX:
391
- seq_idx_ptr += (
392
- pid_b * stride_seq_idx_batch + pid_c * chunk_size * stride_seq_idx_seqlen
393
- )
394
-
395
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
396
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
397
- offs_k = tl.arange(0, BLOCK_SIZE_K)
398
- x_ptrs = x_ptr + (
399
- offs_m[:, None] * stride_x_hdim + offs_k[None, :] * stride_x_seqlen
400
- )
401
- b_ptrs = b_ptr + (
402
- offs_n[None, :] * stride_b_dstate + offs_k[:, None] * stride_b_seqlen
403
- )
404
- dt_ptrs = dt_ptr + offs_k * stride_dt_csize
405
- dA_cs_last = tl.load(dA_cumsum_ptr + (chunk_size - 1) * stride_dA_cs_csize).to(
406
- tl.float32
407
- )
408
- dA_cumsum_ptrs = dA_cumsum_ptr + offs_k * stride_dA_cs_csize
409
- if HAS_SEQ_IDX:
410
- seq_idx_ptrs = seq_idx_ptr + offs_k * stride_seq_idx_seqlen
411
-
412
- chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
413
- if HAS_SEQ_IDX:
414
- seq_idx_last = tl.load(
415
- seq_idx_ptr + (chunk_size_limit - 1) * stride_seq_idx_seqlen
416
- )
417
-
418
- acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
419
- for k in range(0, chunk_size_limit, BLOCK_SIZE_K):
420
- x = tl.load(
421
- x_ptrs,
422
- mask=(offs_m[:, None] < hdim) & (offs_k[None, :] < chunk_size_limit - k),
423
- other=0.0,
424
- )
425
- b = tl.load(
426
- b_ptrs,
427
- mask=(offs_k[:, None] < chunk_size_limit - k) & (offs_n[None, :] < dstate),
428
- other=0.0,
429
- ).to(tl.float32)
430
- dA_cs_k = tl.load(
431
- dA_cumsum_ptrs, mask=offs_k < chunk_size_limit - k, other=0.0
432
- ).to(tl.float32)
433
- if HAS_SEQ_IDX:
434
- seq_idx_k = tl.load(
435
- seq_idx_ptrs, mask=offs_k < chunk_size_limit - k, other=-1
436
- )
437
- dt_k = tl.load(dt_ptrs, mask=offs_k < chunk_size_limit - k, other=0.0).to(
438
- tl.float32
439
- )
440
- if not HAS_SEQ_IDX:
441
- scale = tl.exp((dA_cs_last - dA_cs_k)) * dt_k
442
- else:
443
- scale = tl.where(
444
- seq_idx_k == seq_idx_last, tl.exp((dA_cs_last - dA_cs_k)) * dt_k, 0.0
445
- )
446
- b *= scale[:, None]
447
- b = b.to(x_ptr.dtype.element_ty)
448
- acc += tl.dot(x, b)
449
- x_ptrs += BLOCK_SIZE_K * stride_x_seqlen
450
- b_ptrs += BLOCK_SIZE_K * stride_b_seqlen
451
- dt_ptrs += BLOCK_SIZE_K * stride_dt_csize
452
- dA_cumsum_ptrs += BLOCK_SIZE_K * stride_dA_cs_csize
453
- if HAS_SEQ_IDX:
454
- seq_idx_ptrs += BLOCK_SIZE_K * stride_seq_idx_seqlen
455
- states = acc.to(states_ptr.dtype.element_ty)
456
-
457
- states_ptr += (
458
- pid_b * stride_states_batch
459
- + pid_c * stride_states_chunk
460
- + pid_h * stride_states_head
461
- )
462
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
463
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
464
- states_ptrs = states_ptr + (
465
- offs_m[:, None] * stride_states_hdim + offs_n[None, :] * stride_states_dstate
466
- )
467
- c_mask = (offs_m[:, None] < hdim) & (offs_n[None, :] < dstate)
468
- tl.store(states_ptrs, states, mask=c_mask)
469
-
470
-
471
- @triton.autotune(
472
- configs=[
473
- triton.Config(
474
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64},
475
- num_stages=3,
476
- num_warps=8,
477
- pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"]),
478
- ),
479
- triton.Config(
480
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32},
481
- num_stages=4,
482
- num_warps=4,
483
- pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"]),
484
- ),
485
- triton.Config(
486
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32},
487
- num_stages=4,
488
- num_warps=4,
489
- pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"]),
490
- ),
491
- triton.Config(
492
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
493
- num_stages=4,
494
- num_warps=4,
495
- pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"]),
496
- ),
497
- triton.Config(
498
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32},
499
- num_stages=4,
500
- num_warps=4,
501
- pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"]),
502
- ),
503
- triton.Config(
504
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32},
505
- num_stages=4,
506
- num_warps=4,
507
- pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"]),
508
- ),
509
- triton.Config(
510
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32},
511
- num_stages=5,
512
- num_warps=4,
513
- pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"]),
514
- ),
515
- triton.Config(
516
- {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
517
- num_stages=5,
518
- num_warps=4,
519
- pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"]),
520
- ),
521
- triton.Config(
522
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
523
- num_stages=4,
524
- num_warps=4,
525
- pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"]),
526
- ),
527
- ],
528
- key=["chunk_size", "hdim", "dstate"],
529
- )
530
- @triton.jit
531
- def _chunk_state_bwd_dx_kernel(
532
- # Pointers to matrices
533
- x_ptr,
534
- b_ptr,
535
- dstates_ptr,
536
- dt_ptr,
537
- dA_cumsum_ptr,
538
- dx_ptr,
539
- ddt_ptr,
540
- ddA_cumsum_ptr,
541
- # Matrix dimensions
542
- chunk_size,
543
- hdim,
544
- dstate,
545
- batch,
546
- seqlen,
547
- nheads_ngroups_ratio,
548
- # Strides
549
- stride_x_batch,
550
- stride_x_seqlen,
551
- stride_x_head,
552
- stride_x_hdim,
553
- stride_b_batch,
554
- stride_b_seqlen,
555
- stride_b_head,
556
- stride_b_dstate,
557
- stride_dstates_batch,
558
- stride_dstates_chunk,
559
- stride_states_head,
560
- stride_states_hdim,
561
- stride_states_dstate,
562
- stride_dt_batch,
563
- stride_dt_chunk,
564
- stride_dt_head,
565
- stride_dt_csize,
566
- stride_dA_cs_batch,
567
- stride_dA_cs_chunk,
568
- stride_dA_cs_head,
569
- stride_dA_cs_csize,
570
- stride_dx_batch,
571
- stride_dx_seqlen,
572
- stride_dx_head,
573
- stride_dx_hdim,
574
- stride_ddt_batch,
575
- stride_ddt_chunk,
576
- stride_ddt_head,
577
- stride_ddt_csize,
578
- stride_ddA_cs_batch,
579
- stride_ddA_cs_chunk,
580
- stride_ddA_cs_head,
581
- stride_ddA_cs_csize,
582
- # Meta-parameters
583
- BLOCK_SIZE_M: tl.constexpr,
584
- BLOCK_SIZE_N: tl.constexpr,
585
- BLOCK_SIZE_K: tl.constexpr,
586
- BLOCK_SIZE_DSTATE: tl.constexpr,
587
- ):
588
- pid_bc = tl.program_id(axis=1)
589
- pid_c = pid_bc // batch
590
- pid_b = pid_bc - pid_c * batch
591
- pid_h = tl.program_id(axis=2)
592
- num_pid_n = tl.cdiv(hdim, BLOCK_SIZE_N)
593
- pid_m = tl.program_id(axis=0) // num_pid_n
594
- pid_n = tl.program_id(axis=0) % num_pid_n
595
- x_ptr += (
596
- pid_b * stride_x_batch
597
- + pid_c * chunk_size * stride_x_seqlen
598
- + pid_h * stride_x_head
599
- )
600
- b_ptr += (
601
- pid_b * stride_b_batch
602
- + pid_c * chunk_size * stride_b_seqlen
603
- + (pid_h // nheads_ngroups_ratio) * stride_b_head
604
- )
605
- dstates_ptr += (
606
- pid_b * stride_dstates_batch
607
- + pid_c * stride_dstates_chunk
608
- + pid_h * stride_states_head
609
- )
610
- dt_ptr += pid_b * stride_dt_batch + pid_c * stride_dt_chunk + pid_h * stride_dt_head
611
- ddt_ptr += (
612
- pid_b * stride_ddt_batch + pid_c * stride_ddt_chunk + pid_h * stride_ddt_head
613
- )
614
- ddA_cumsum_ptr += (
615
- pid_b * stride_ddA_cs_batch
616
- + pid_c * stride_ddA_cs_chunk
617
- + pid_h * stride_ddA_cs_head
618
- )
619
- dA_cumsum_ptr += (
620
- pid_b * stride_dA_cs_batch
621
- + pid_c * stride_dA_cs_chunk
622
- + pid_h * stride_dA_cs_head
623
- )
624
-
625
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
626
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
627
-
628
- chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
629
- # Faster to just do 1 iteration with larger BLOCK_SIZE_K, up to block size 128
630
- offs_k = tl.arange(
631
- 0, BLOCK_SIZE_DSTATE if BLOCK_SIZE_DSTATE <= 128 else BLOCK_SIZE_K
632
- )
633
- b_ptrs = b_ptr + (
634
- offs_m[:, None] * stride_b_seqlen + offs_k[None, :] * stride_b_dstate
635
- )
636
- dstates_ptrs = dstates_ptr + (
637
- offs_n[None, :] * stride_states_hdim + offs_k[:, None] * stride_states_dstate
638
- )
639
- if BLOCK_SIZE_DSTATE <= 128:
640
- b = tl.load(
641
- b_ptrs,
642
- mask=(offs_m[:, None] < chunk_size_limit) & (offs_k[None, :] < dstate),
643
- other=0.0,
644
- )
645
- dstates = tl.load(
646
- dstates_ptrs,
647
- mask=(offs_k[:, None] < dstate) & (offs_n[None, :] < hdim),
648
- other=0.0,
649
- )
650
- dstates = dstates.to(b_ptr.dtype.element_ty)
651
- acc = tl.dot(b, dstates)
652
- else:
653
- acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
654
- for k in range(0, dstate, BLOCK_SIZE_K):
655
- b = tl.load(
656
- b_ptrs,
657
- mask=(offs_m[:, None] < chunk_size_limit)
658
- & (offs_k[None, :] < dstate - k),
659
- other=0.0,
660
- )
661
- dstates = tl.load(
662
- dstates_ptrs,
663
- mask=(offs_k[:, None] < dstate - k) & (offs_n[None, :] < hdim),
664
- other=0.0,
665
- )
666
- dstates = dstates.to(b_ptr.dtype.element_ty)
667
- acc += tl.dot(b, dstates)
668
- b_ptrs += BLOCK_SIZE_K * stride_b_dstate
669
- dstates_ptrs += BLOCK_SIZE_K * stride_states_dstate
670
-
671
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
672
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
673
-
674
- dA_cs_last = tl.load(dA_cumsum_ptr + (chunk_size - 1) * stride_dA_cs_csize).to(
675
- tl.float32
676
- )
677
- dt_ptrs = dt_ptr + offs_m * stride_dt_csize
678
- dA_cumsum_ptrs = dA_cumsum_ptr + offs_m * stride_dA_cs_csize
679
- dA_cs_m = tl.load(dA_cumsum_ptrs, mask=offs_m < chunk_size, other=0.0).to(
680
- tl.float32
681
- )
682
- dt_m = tl.load(dt_ptrs, mask=offs_m < chunk_size, other=0.0).to(tl.float32)
683
- acc *= tl.exp(dA_cs_last - dA_cs_m)[:, None]
684
-
685
- x_ptrs = x_ptr + (
686
- offs_m[:, None] * stride_x_seqlen + offs_n[None, :] * stride_x_hdim
687
- )
688
- x = tl.load(
689
- x_ptrs,
690
- mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim),
691
- other=0.0,
692
- ).to(tl.float32)
693
- ddt = tl.sum(acc * x, axis=1)
694
- ddt_ptrs = ddt_ptr + offs_m * stride_ddt_csize
695
- tl.atomic_add(ddt_ptrs, ddt, mask=offs_m < chunk_size)
696
- ddA_cs = -(ddt * dt_m)
697
- ddA_cs_last = -tl.sum(ddA_cs)
698
- ddA_cumsum_ptrs = ddA_cumsum_ptr + offs_m * stride_ddA_cs_csize
699
- tl.atomic_add(ddA_cumsum_ptrs, ddA_cs, mask=offs_m < chunk_size)
700
- tl.atomic_add(ddA_cumsum_ptr + (chunk_size - 1) * stride_ddA_cs_csize, ddA_cs_last)
701
-
702
- dx = (acc * dt_m[:, None]).to(dx_ptr.dtype.element_ty)
703
- dx_ptr += (
704
- pid_b * stride_dx_batch
705
- + pid_c * chunk_size * stride_dx_seqlen
706
- + pid_h * stride_dx_head
707
- )
708
- dx_ptrs = dx_ptr + (
709
- offs_m[:, None] * stride_dx_seqlen + offs_n[None, :] * stride_dx_hdim
710
- )
711
- tl.store(
712
- dx_ptrs,
713
- dx,
714
- mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim),
715
- )
716
-
717
-
718
- @triton.autotune(
719
- configs=[
720
- triton.Config(
721
- {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128},
722
- num_stages=3,
723
- num_warps=4,
724
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
725
- ),
726
- triton.Config(
727
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32},
728
- num_stages=3,
729
- num_warps=4,
730
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
731
- ),
732
- triton.Config(
733
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128},
734
- num_stages=3,
735
- num_warps=4,
736
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
737
- ),
738
- triton.Config(
739
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64},
740
- num_stages=3,
741
- num_warps=4,
742
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
743
- ),
744
- triton.Config(
745
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64},
746
- num_stages=3,
747
- num_warps=4,
748
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
749
- ),
750
- triton.Config(
751
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32},
752
- num_stages=3,
753
- num_warps=4,
754
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
755
- ),
756
- triton.Config(
757
- {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64},
758
- num_stages=3,
759
- num_warps=4,
760
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
761
- ),
762
- triton.Config(
763
- {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32},
764
- num_stages=3,
765
- num_warps=4,
766
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
767
- ),
768
- ],
769
- key=["chunk_size", "dstate", "hdim"],
770
- )
771
- @triton.jit
772
- def _chunk_state_bwd_db_kernel(
773
- # Pointers to matrices
774
- x_ptr,
775
- dstates_ptr,
776
- b_ptr,
777
- dt_ptr,
778
- dA_cumsum_ptr,
779
- seq_idx_ptr,
780
- db_ptr,
781
- ddA_cumsum_ptr,
782
- # Matrix dimensions
783
- chunk_size,
784
- dstate,
785
- hdim,
786
- batch,
787
- seqlen,
788
- nheads,
789
- nheads_per_program,
790
- ngroups,
791
- # Strides
792
- stride_x_batch,
793
- stride_x_seqlen,
794
- stride_x_head,
795
- stride_x_hdim,
796
- stride_dstates_batch,
797
- stride_dstates_chunk,
798
- stride_states_head,
799
- stride_states_hdim,
800
- stride_states_dstate,
801
- stride_b_batch,
802
- stride_b_seqlen,
803
- stride_b_head,
804
- stride_b_dstate,
805
- stride_dt_batch,
806
- stride_dt_chunk,
807
- stride_dt_head,
808
- stride_dt_csize,
809
- stride_dA_cs_batch,
810
- stride_dA_cs_chunk,
811
- stride_dA_cs_head,
812
- stride_dA_cs_csize,
813
- stride_seq_idx_batch,
814
- stride_seq_idx_seqlen,
815
- stride_db_batch,
816
- stride_db_seqlen,
817
- stride_db_split,
818
- stride_db_group,
819
- stride_db_dstate,
820
- stride_ddA_cs_batch,
821
- stride_ddA_cs_chunk,
822
- stride_ddA_cs_head,
823
- stride_ddA_cs_csize,
824
- # Meta-parameters
825
- HAS_DDA_CS: tl.constexpr,
826
- HAS_SEQ_IDX: tl.constexpr,
827
- BLOCK_SIZE_M: tl.constexpr,
828
- BLOCK_SIZE_N: tl.constexpr,
829
- BLOCK_SIZE_K: tl.constexpr,
830
- ):
831
- pid_bc = tl.program_id(axis=1)
832
- pid_c = pid_bc // batch
833
- pid_b = pid_bc - pid_c * batch
834
- pid_sg = tl.program_id(axis=2)
835
- pid_s = pid_sg // ngroups
836
- pid_g = pid_sg - pid_s * ngroups
837
- num_pid_n = tl.cdiv(dstate, BLOCK_SIZE_N)
838
- pid_m = tl.program_id(axis=0) // num_pid_n
839
- pid_n = tl.program_id(axis=0) % num_pid_n
840
- x_ptr += (
841
- pid_b * stride_x_batch
842
- + pid_c * chunk_size * stride_x_seqlen
843
- + (pid_g * (nheads // ngroups) + pid_s * nheads_per_program) * stride_x_head
844
- )
845
- db_ptr += (
846
- pid_b * stride_db_batch
847
- + pid_c * chunk_size * stride_db_seqlen
848
- + pid_g * stride_db_group
849
- + pid_s * stride_db_split
850
- )
851
- dstates_ptr += (
852
- pid_b * stride_dstates_batch
853
- + pid_c * stride_dstates_chunk
854
- + (pid_g * (nheads // ngroups) + pid_s * nheads_per_program)
855
- * stride_states_head
856
- )
857
- dt_ptr += (
858
- pid_b * stride_dt_batch
859
- + pid_c * stride_dt_chunk
860
- + (pid_g * (nheads // ngroups) + pid_s * nheads_per_program) * stride_dt_head
861
- )
862
- dA_cumsum_ptr += (
863
- pid_b * stride_dA_cs_batch
864
- + pid_c * stride_dA_cs_chunk
865
- + (pid_g * (nheads // ngroups) + pid_s * nheads_per_program) * stride_dA_cs_head
866
- )
867
- if HAS_DDA_CS:
868
- b_ptr += (
869
- pid_b * stride_b_batch
870
- + pid_c * chunk_size * stride_b_seqlen
871
- + pid_g * stride_b_head
872
- )
873
- ddA_cumsum_ptr += (
874
- pid_b * stride_ddA_cs_batch
875
- + pid_c * stride_ddA_cs_chunk
876
- + (pid_g * (nheads // ngroups) + pid_s * nheads_per_program)
877
- * stride_ddA_cs_head
878
- )
879
- if HAS_SEQ_IDX:
880
- seq_idx_ptr += (
881
- pid_b * stride_seq_idx_batch + pid_c * chunk_size * stride_seq_idx_seqlen
882
- )
883
-
884
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
885
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
886
- offs_k = tl.arange(0, BLOCK_SIZE_K)
887
- x_ptrs = x_ptr + (
888
- offs_m[:, None] * stride_x_seqlen + offs_k[None, :] * stride_x_hdim
889
- )
890
- dstates_ptrs = dstates_ptr + (
891
- offs_n[None, :] * stride_states_dstate + offs_k[:, None] * stride_states_hdim
892
- )
893
- dt_ptrs = dt_ptr + offs_m * stride_dt_csize
894
- dA_cumsum_ptrs = dA_cumsum_ptr + offs_m * stride_dA_cs_csize
895
- if HAS_DDA_CS:
896
- b_ptrs = b_ptr + (
897
- offs_m[:, None] * stride_b_seqlen + offs_n[None, :] * stride_b_dstate
898
- )
899
- ddA_cumsum_ptrs = ddA_cumsum_ptr + offs_m * stride_ddA_cs_csize
900
-
901
- chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
902
- acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
903
- if HAS_DDA_CS:
904
- b = tl.load(
905
- b_ptrs,
906
- mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < dstate),
907
- other=0.0,
908
- ).to(tl.float32)
909
- if HAS_SEQ_IDX:
910
- seq_idx_m = tl.load(
911
- seq_idx_ptr + offs_m * stride_seq_idx_seqlen,
912
- mask=offs_m < chunk_size_limit,
913
- other=-1,
914
- )
915
- seq_idx_last = tl.load(
916
- seq_idx_ptr + (chunk_size_limit - 1) * stride_seq_idx_seqlen
917
- )
918
- nheads_iter = min(
919
- nheads_per_program, nheads // ngroups - pid_s * nheads_per_program
920
- )
921
- for h in range(nheads_iter):
922
- x = tl.load(
923
- x_ptrs,
924
- mask=(offs_m[:, None] < chunk_size_limit) & (offs_k[None, :] < hdim),
925
- other=0.0,
926
- )
927
- dstates = tl.load(
928
- dstates_ptrs,
929
- mask=(offs_k[:, None] < hdim) & (offs_n[None, :] < dstate),
930
- other=0.0,
931
- )
932
- dstates = dstates.to(x_ptrs.dtype.element_ty)
933
- db = tl.dot(x, dstates)
934
- dA_cs_last = tl.load(dA_cumsum_ptr + (chunk_size - 1) * stride_dA_cs_csize).to(
935
- tl.float32
936
- )
937
- dA_cs_m = tl.load(dA_cumsum_ptrs, mask=offs_m < chunk_size, other=0.0).to(
938
- tl.float32
939
- )
940
- dt_m = tl.load(dt_ptrs, mask=offs_m < chunk_size, other=0.0).to(tl.float32)
941
- if not HAS_SEQ_IDX:
942
- scale = tl.exp(dA_cs_last - dA_cs_m)
943
- else:
944
- scale = tl.where(
945
- seq_idx_m == seq_idx_last, tl.exp(dA_cs_last - dA_cs_m), 0.0
946
- )
947
- db *= (scale * dt_m)[:, None]
948
- if HAS_DDA_CS:
949
- # This is the gradient wrt (dA_cs_last - dA_cs_m), i.e. the exclusive reverse cumsum
950
- ddA_cs = tl.sum(db * b, axis=1)
951
- tl.atomic_add(
952
- ddA_cumsum_ptrs + stride_ddA_cs_csize,
953
- ddA_cs,
954
- mask=offs_m < chunk_size - 1,
955
- )
956
- acc += db
957
- x_ptrs += stride_x_head
958
- dstates_ptrs += stride_states_head
959
- dt_ptrs += stride_dt_head
960
- dA_cumsum_ptr += stride_dA_cs_head
961
- dA_cumsum_ptrs += stride_dA_cs_head
962
- if HAS_DDA_CS:
963
- ddA_cumsum_ptrs += stride_ddA_cs_head
964
-
965
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
966
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
967
- # if HAS_SEQ_IDX:
968
- # seq_idx_last = tl.load(seq_idx_ptr + (chunk_size_limit - 1) * stride_seq_idx_seqlen)
969
- # seq_idx_m = tl.load(seq_idx_ptr + offs_m * stride_seq_idx_seqlen, mask=offs_m < chunk_size_limit, other=-1)
970
- # acc = tl.where(seq_idx_m[:, None] == seq_idx_last, acc, 0.0)
971
- db_ptrs = db_ptr + (
972
- offs_m[:, None] * stride_db_seqlen + offs_n[None, :] * stride_db_dstate
973
- )
974
- tl.store(
975
- db_ptrs,
976
- acc,
977
- mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < dstate),
978
- )
979
-
980
-
981
- @triton.autotune(
982
- configs=[
983
- # triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64}, num_stages=3, num_warps=8, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
984
- # triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
985
- # triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
986
- # triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
987
- # triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
988
- # triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
989
- # triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32}, num_stages=5, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
990
- # triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=5, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
991
- # triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
992
- triton.Config(
993
- {"BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 32},
994
- num_stages=3,
995
- num_warps=4,
996
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
997
- ),
998
- triton.Config(
999
- {"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32},
1000
- num_stages=3,
1001
- num_warps=4,
1002
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
1003
- ),
1004
- triton.Config(
1005
- {"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
1006
- num_stages=3,
1007
- num_warps=4,
1008
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
1009
- ),
1010
- triton.Config(
1011
- {"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32},
1012
- num_stages=3,
1013
- num_warps=4,
1014
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
1015
- ),
1016
- triton.Config(
1017
- {"BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 32},
1018
- num_stages=4,
1019
- num_warps=8,
1020
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
1021
- ),
1022
- triton.Config(
1023
- {"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32},
1024
- num_stages=4,
1025
- num_warps=8,
1026
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
1027
- ),
1028
- triton.Config(
1029
- {"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
1030
- num_stages=4,
1031
- num_warps=8,
1032
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
1033
- ),
1034
- triton.Config(
1035
- {"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32},
1036
- num_stages=4,
1037
- num_warps=8,
1038
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
1039
- ),
1040
- ],
1041
- key=["chunk_size", "hdim", "dstate"],
1042
- )
1043
- @triton.jit
1044
- def _chunk_state_bwd_ddAcs_stable_kernel(
1045
- # Pointers to matrices
1046
- x_ptr,
1047
- b_ptr,
1048
- dstates_ptr,
1049
- dt_ptr,
1050
- dA_cumsum_ptr,
1051
- seq_idx_ptr,
1052
- ddA_cumsum_ptr,
1053
- # Matrix dimensions
1054
- chunk_size,
1055
- hdim,
1056
- dstate,
1057
- batch,
1058
- seqlen,
1059
- nheads_ngroups_ratio,
1060
- # Strides
1061
- stride_x_batch,
1062
- stride_x_seqlen,
1063
- stride_x_head,
1064
- stride_x_hdim,
1065
- stride_b_batch,
1066
- stride_b_seqlen,
1067
- stride_b_head,
1068
- stride_b_dstate,
1069
- stride_dstates_batch,
1070
- stride_dstates_chunk,
1071
- stride_states_head,
1072
- stride_states_hdim,
1073
- stride_states_dstate,
1074
- stride_dt_batch,
1075
- stride_dt_chunk,
1076
- stride_dt_head,
1077
- stride_dt_csize,
1078
- stride_dA_cs_batch,
1079
- stride_dA_cs_chunk,
1080
- stride_dA_cs_head,
1081
- stride_dA_cs_csize,
1082
- stride_seq_idx_batch,
1083
- stride_seq_idx_seqlen,
1084
- stride_ddA_cs_batch,
1085
- stride_ddA_cs_chunk,
1086
- stride_ddA_cs_head,
1087
- stride_ddA_cs_csize,
1088
- # Meta-parameters
1089
- HAS_SEQ_IDX: tl.constexpr,
1090
- BLOCK_SIZE_M: tl.constexpr,
1091
- BLOCK_SIZE_N: tl.constexpr,
1092
- BLOCK_SIZE_K: tl.constexpr,
1093
- BLOCK_SIZE_DSTATE: tl.constexpr,
1094
- ):
1095
- pid_bc = tl.program_id(axis=1)
1096
- pid_c = pid_bc // batch
1097
- pid_b = pid_bc - pid_c * batch
1098
- pid_h = tl.program_id(axis=2)
1099
- num_pid_n = tl.cdiv(hdim, BLOCK_SIZE_N)
1100
- pid_m = tl.program_id(axis=0) // num_pid_n
1101
- pid_n = tl.program_id(axis=0) % num_pid_n
1102
- x_ptr += (
1103
- pid_b * stride_x_batch
1104
- + pid_c * chunk_size * stride_x_seqlen
1105
- + pid_h * stride_x_head
1106
- )
1107
- b_ptr += (
1108
- pid_b * stride_b_batch
1109
- + pid_c * chunk_size * stride_b_seqlen
1110
- + (pid_h // nheads_ngroups_ratio) * stride_b_head
1111
- )
1112
- dstates_ptr += (
1113
- pid_b * stride_dstates_batch
1114
- + pid_c * stride_dstates_chunk
1115
- + pid_h * stride_states_head
1116
- )
1117
- dt_ptr += pid_b * stride_dt_batch + pid_c * stride_dt_chunk + pid_h * stride_dt_head
1118
- ddA_cumsum_ptr += (
1119
- pid_b * stride_ddA_cs_batch
1120
- + pid_c * stride_ddA_cs_chunk
1121
- + pid_h * stride_ddA_cs_head
1122
- )
1123
- dA_cumsum_ptr += (
1124
- pid_b * stride_dA_cs_batch
1125
- + pid_c * stride_dA_cs_chunk
1126
- + pid_h * stride_dA_cs_head
1127
- )
1128
- if HAS_SEQ_IDX:
1129
- seq_idx_ptr += (
1130
- pid_b * stride_seq_idx_batch + pid_c * chunk_size * stride_seq_idx_seqlen
1131
- )
1132
-
1133
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
1134
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
1135
-
1136
- chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
1137
- # Faster to just do 1 iteration with larger BLOCK_SIZE_K, up to block size 128
1138
- offs_k = tl.arange(
1139
- 0, BLOCK_SIZE_DSTATE if BLOCK_SIZE_DSTATE <= 128 else BLOCK_SIZE_K
1140
- )
1141
- b_ptrs = b_ptr + (
1142
- offs_m[:, None] * stride_b_seqlen + offs_k[None, :] * stride_b_dstate
1143
- )
1144
- dstates_ptrs = dstates_ptr + (
1145
- offs_n[None, :] * stride_states_hdim + offs_k[:, None] * stride_states_dstate
1146
- )
1147
- if BLOCK_SIZE_DSTATE <= 128:
1148
- b = tl.load(
1149
- b_ptrs,
1150
- mask=(offs_m[:, None] < chunk_size_limit) & (offs_k[None, :] < dstate),
1151
- other=0.0,
1152
- )
1153
- dstates = tl.load(
1154
- dstates_ptrs,
1155
- mask=(offs_k[:, None] < dstate) & (offs_n[None, :] < hdim),
1156
- other=0.0,
1157
- )
1158
- dstates = dstates.to(b_ptr.dtype.element_ty)
1159
- acc = tl.dot(b, dstates)
1160
- else:
1161
- acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
1162
- for k in range(0, dstate, BLOCK_SIZE_K):
1163
- b = tl.load(
1164
- b_ptrs,
1165
- mask=(offs_m[:, None] < chunk_size_limit)
1166
- & (offs_k[None, :] < dstate - k),
1167
- other=0.0,
1168
- )
1169
- dstates = tl.load(
1170
- dstates_ptrs,
1171
- mask=(offs_k[:, None] < dstate - k) & (offs_n[None, :] < hdim),
1172
- other=0.0,
1173
- )
1174
- dstates = dstates.to(b_ptr.dtype.element_ty)
1175
- acc += tl.dot(b, dstates)
1176
- b_ptrs += BLOCK_SIZE_K * stride_b_dstate
1177
- dstates_ptrs += BLOCK_SIZE_K * stride_states_dstate
1178
-
1179
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
1180
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
1181
-
1182
- dA_cs_m = tl.load(
1183
- dA_cumsum_ptr + offs_m * stride_dA_cs_csize, mask=offs_m < chunk_size, other=0.0
1184
- ).to(tl.float32)
1185
- dA_cs_last = tl.load(dA_cumsum_ptr + (chunk_size - 1) * stride_dA_cs_csize).to(
1186
- tl.float32
1187
- )
1188
- if not HAS_SEQ_IDX:
1189
- scale = tl.exp(dA_cs_last - dA_cs_m)
1190
- else:
1191
- seq_idx_m = tl.load(
1192
- seq_idx_ptr + offs_m * stride_seq_idx_seqlen,
1193
- mask=offs_m < chunk_size_limit,
1194
- other=-1,
1195
- )
1196
- seq_idx_last = tl.load(
1197
- seq_idx_ptr + (chunk_size_limit - 1) * stride_seq_idx_seqlen
1198
- )
1199
- scale = tl.where(seq_idx_m == seq_idx_last, tl.exp(dA_cs_last - dA_cs_m), 0.0)
1200
- acc *= scale[:, None]
1201
-
1202
- x_ptrs = x_ptr + (
1203
- offs_m[:, None] * stride_x_seqlen + offs_n[None, :] * stride_x_hdim
1204
- )
1205
- x = tl.load(
1206
- x_ptrs,
1207
- mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim),
1208
- other=0.0,
1209
- ).to(tl.float32)
1210
- dt_ptrs = dt_ptr + offs_m * stride_dt_csize
1211
- dt_m = tl.load(dt_ptrs, mask=offs_m < chunk_size, other=0.0).to(tl.float32)
1212
- ddt = tl.sum(acc * x, axis=1)
1213
- # ddA_cs = -(ddt * dt_m)
1214
- # Triton 2.2.0 errors if we have the cumsum here, so we just write it out
1215
- # then call torch.cumsum outside this kernel.
1216
- # ddA_cs = tl.cumsum(ddt * dt_m)
1217
- ddA_cs = ddt * dt_m
1218
- ddA_cumsum_ptrs = ddA_cumsum_ptr + offs_m * stride_ddA_cs_csize
1219
- # tl.atomic_add(ddA_cumsum_ptrs, ddA_cs, mask=offs_m < chunk_size)
1220
- tl.atomic_add(
1221
- ddA_cumsum_ptrs + stride_ddA_cs_csize, ddA_cs, mask=offs_m < chunk_size - 1
1222
- )
1223
-
1224
-
1225
- @triton.autotune(
1226
- configs=[
1227
- triton.Config(
1228
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64},
1229
- num_stages=3,
1230
- num_warps=8,
1231
- ),
1232
- triton.Config(
1233
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32},
1234
- num_stages=4,
1235
- num_warps=4,
1236
- ),
1237
- triton.Config(
1238
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32},
1239
- num_stages=4,
1240
- num_warps=4,
1241
- ),
1242
- triton.Config(
1243
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
1244
- num_stages=4,
1245
- num_warps=4,
1246
- ),
1247
- triton.Config(
1248
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32},
1249
- num_stages=4,
1250
- num_warps=4,
1251
- ),
1252
- triton.Config(
1253
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32},
1254
- num_stages=4,
1255
- num_warps=4,
1256
- ),
1257
- triton.Config(
1258
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32},
1259
- num_stages=5,
1260
- num_warps=2,
1261
- ),
1262
- triton.Config(
1263
- {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
1264
- num_stages=5,
1265
- num_warps=2,
1266
- ),
1267
- triton.Config(
1268
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
1269
- num_stages=4,
1270
- num_warps=2,
1271
- ),
1272
- ],
1273
- key=["hdim", "dstate", "chunk_size"],
1274
- )
1275
- @triton.jit
1276
- def _chunk_state_varlen_kernel(
1277
- # Pointers to matrices
1278
- x_ptr,
1279
- b_ptr,
1280
- dt_ptr,
1281
- dA_cumsum_ptr,
1282
- chunk_states_ptr,
1283
- cu_seqlens_ptr,
1284
- states_ptr,
1285
- # Matrix dimensions
1286
- hdim,
1287
- dstate,
1288
- chunk_size,
1289
- seqlen,
1290
- nheads_ngroups_ratio,
1291
- # Strides
1292
- stride_x_seqlen,
1293
- stride_x_head,
1294
- stride_x_hdim,
1295
- stride_b_seqlen,
1296
- stride_b_head,
1297
- stride_b_dstate,
1298
- stride_dt_chunk,
1299
- stride_dt_head,
1300
- stride_dt_csize,
1301
- stride_dA_cs_chunk,
1302
- stride_dA_cs_head,
1303
- stride_dA_cs_csize,
1304
- stride_chunk_states_chunk,
1305
- stride_chunk_states_head,
1306
- stride_chunk_states_hdim,
1307
- stride_chunk_states_dstate,
1308
- stride_states_batch,
1309
- stride_states_head,
1310
- stride_states_hdim,
1311
- stride_states_dstate,
1312
- # Meta-parameters
1313
- BLOCK_SIZE_M: tl.constexpr,
1314
- BLOCK_SIZE_N: tl.constexpr,
1315
- BLOCK_SIZE_K: tl.constexpr,
1316
- ):
1317
- pid_b = tl.program_id(axis=1)
1318
- pid_h = tl.program_id(axis=2)
1319
- num_pid_n = tl.cdiv(dstate, BLOCK_SIZE_N)
1320
- pid_m = tl.program_id(axis=0) // num_pid_n
1321
- pid_n = tl.program_id(axis=0) % num_pid_n
1322
- end_idx = tl.load(cu_seqlens_ptr + pid_b + 1)
1323
- pid_c = (end_idx - 1) // chunk_size
1324
- b_ptr += (
1325
- pid_c * chunk_size * stride_b_seqlen
1326
- + (pid_h // nheads_ngroups_ratio) * stride_b_head
1327
- )
1328
- x_ptr += pid_c * chunk_size * stride_x_seqlen + pid_h * stride_x_head
1329
- dt_ptr += pid_c * stride_dt_chunk + pid_h * stride_dt_head
1330
- dA_cumsum_ptr += pid_c * stride_dA_cs_chunk + pid_h * stride_dA_cs_head
1331
- chunk_states_ptr += (
1332
- pid_c * stride_chunk_states_chunk + pid_h * stride_chunk_states_head
1333
- )
1334
-
1335
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
1336
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
1337
- offs_k = tl.arange(0, BLOCK_SIZE_K)
1338
- x_ptrs = x_ptr + (
1339
- offs_m[:, None] * stride_x_hdim + offs_k[None, :] * stride_x_seqlen
1340
- )
1341
- b_ptrs = b_ptr + (
1342
- offs_n[None, :] * stride_b_dstate + offs_k[:, None] * stride_b_seqlen
1343
- )
1344
- dt_ptrs = dt_ptr + offs_k * stride_dt_csize
1345
- dA_cs_last = tl.load(
1346
- dA_cumsum_ptr + (end_idx - pid_c * chunk_size - 1) * stride_dA_cs_csize
1347
- ).to(tl.float32)
1348
- dA_cumsum_ptrs = dA_cumsum_ptr + offs_k * stride_dA_cs_csize
1349
-
1350
- chunk_size_limit = end_idx - pid_c * chunk_size
1351
- start_idx = tl.load(cu_seqlens_ptr + pid_b)
1352
- start_idx_cur = tl.maximum(start_idx - pid_c * chunk_size, 0)
1353
-
1354
- acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
1355
- for k in range(0, chunk_size_limit, BLOCK_SIZE_K):
1356
- x = tl.load(
1357
- x_ptrs,
1358
- mask=(offs_m[:, None] < hdim)
1359
- & (offs_k[None, :] < chunk_size_limit - k)
1360
- & (offs_k[None, :] >= start_idx_cur - k),
1361
- other=0.0,
1362
- )
1363
- b = tl.load(
1364
- b_ptrs,
1365
- mask=(offs_k[:, None] < chunk_size_limit - k)
1366
- & (offs_n[None, :] < dstate)
1367
- & (offs_k[:, None] >= start_idx_cur - k),
1368
- other=0.0,
1369
- ).to(tl.float32)
1370
- dA_cs_k = tl.load(
1371
- dA_cumsum_ptrs, mask=offs_k < chunk_size_limit - k, other=0.0
1372
- ).to(tl.float32)
1373
- dt_k = tl.load(dt_ptrs, mask=offs_k < chunk_size_limit - k, other=0.0).to(
1374
- tl.float32
1375
- )
1376
- scale = tl.where(
1377
- (offs_k >= start_idx_cur - k) & (offs_k < chunk_size_limit - k),
1378
- tl.exp((dA_cs_last - dA_cs_k)) * dt_k,
1379
- 0.0,
1380
- )
1381
- b *= scale[:, None]
1382
- b = b.to(x_ptr.dtype.element_ty)
1383
- acc += tl.dot(x, b)
1384
- x_ptrs += BLOCK_SIZE_K * stride_x_seqlen
1385
- b_ptrs += BLOCK_SIZE_K * stride_b_seqlen
1386
- dt_ptrs += BLOCK_SIZE_K * stride_dt_csize
1387
- dA_cumsum_ptrs += BLOCK_SIZE_K * stride_dA_cs_csize
1388
-
1389
- # If the sequence starts after the last chunk idx, we don't need to add the contribution from the last chunk
1390
- if start_idx < pid_c * chunk_size:
1391
- chunk_states_ptrs = chunk_states_ptr + (
1392
- offs_m[:, None] * stride_chunk_states_hdim
1393
- + offs_n[None, :] * stride_chunk_states_dstate
1394
- )
1395
- chunk_states = tl.load(
1396
- chunk_states_ptrs,
1397
- mask=(offs_m[:, None] < hdim) & (offs_n[None, :] < dstate),
1398
- other=0.0,
1399
- ).to(tl.float32)
1400
- # scale = tl.where(start_idx < pid_c * chunk_size, tl.exp(dA_cs_last), 0.0)
1401
- scale = tl.exp(dA_cs_last)
1402
- acc += chunk_states * scale
1403
-
1404
- states = acc.to(states_ptr.dtype.element_ty)
1405
-
1406
- states_ptr += pid_b * stride_states_batch + pid_h * stride_states_head
1407
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
1408
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
1409
- states_ptrs = states_ptr + (
1410
- offs_m[:, None] * stride_states_hdim + offs_n[None, :] * stride_states_dstate
1411
- )
1412
- c_mask = (offs_m[:, None] < hdim) & (offs_n[None, :] < dstate)
1413
- tl.store(states_ptrs, states, mask=c_mask)
1414
-
1415
-
1416
- def _chunk_cumsum_fwd(
1417
- dt, A, chunk_size, dt_bias=None, dt_softplus=False, dt_limit=(0.0, float("inf"))
1418
- ):
1419
- batch, seqlen, nheads = dt.shape
1420
- assert A.shape == (nheads,)
1421
- if dt_bias is not None:
1422
- assert dt_bias.shape == (nheads,)
1423
- nchunks = math.ceil(seqlen / chunk_size)
1424
- dt_out = torch.empty(
1425
- batch, nheads, nchunks, chunk_size, device=dt.device, dtype=torch.float32
1426
- )
1427
- dA_cumsum = torch.empty(
1428
- batch, nheads, nchunks, chunk_size, device=dt.device, dtype=torch.float32
1429
- )
1430
- grid_chunk_cs = lambda META: (
1431
- batch,
1432
- nchunks,
1433
- triton.cdiv(nheads, META["BLOCK_SIZE_H"]),
1434
- )
1435
- with torch.cuda.device(dt.device.index):
1436
- _chunk_cumsum_fwd_kernel[grid_chunk_cs](
1437
- dt,
1438
- A,
1439
- dt_bias,
1440
- dt_out,
1441
- dA_cumsum,
1442
- batch,
1443
- seqlen,
1444
- nheads,
1445
- chunk_size,
1446
- dt_limit[0],
1447
- dt_limit[1],
1448
- dt.stride(0),
1449
- dt.stride(1),
1450
- dt.stride(2),
1451
- A.stride(0),
1452
- dt_bias.stride(0) if dt_bias is not None else 0,
1453
- dt_out.stride(0),
1454
- dt_out.stride(2),
1455
- dt_out.stride(1),
1456
- dt_out.stride(3),
1457
- dA_cumsum.stride(0),
1458
- dA_cumsum.stride(2),
1459
- dA_cumsum.stride(1),
1460
- dA_cumsum.stride(3),
1461
- dt_softplus,
1462
- HAS_DT_BIAS=dt_bias is not None,
1463
- BLOCK_SIZE_CHUNK=triton.next_power_of_2(chunk_size),
1464
- )
1465
- return dA_cumsum, dt_out
1466
-
1467
-
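For orientation (not part of this diff), a hypothetical call of the wrapper above; the sizes are made up, only the shapes matter, and a CUDA device with Triton is assumed.

import math
import torch

batch, seqlen, nheads, chunk_size = 2, 512, 8, 256
dt = torch.rand(batch, seqlen, nheads, device="cuda")
A = -torch.rand(nheads, device="cuda")                   # per-head decay parameter, shape (nheads,)
dA_cumsum, dt_out = _chunk_cumsum_fwd(dt, A, chunk_size, dt_softplus=True)
nchunks = math.ceil(seqlen / chunk_size)                 # == 2 here
assert dA_cumsum.shape == dt_out.shape == (batch, nheads, nchunks, chunk_size)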
1468
- def _chunk_cumsum_bwd(
1469
- ddA,
1470
- ddt_out,
1471
- dt,
1472
- A,
1473
- dt_bias=None,
1474
- dt_softplus=False,
1475
- dt_limit=(0.0, float("inf")),
1476
- ddt=None,
1477
- ):
1478
- batch, seqlen, nheads = dt.shape
1479
- _, _, nchunks, chunk_size = ddA.shape
1480
- assert ddA.shape == (batch, nheads, nchunks, chunk_size)
1481
- assert ddt_out.shape == (batch, nheads, nchunks, chunk_size)
1482
- assert A.shape == (nheads,)
1483
- if dt_bias is not None:
1484
- assert dt_bias.shape == (nheads,)
1485
- ddt_bias = torch.empty_like(dt_bias, dtype=torch.float32)
1486
- else:
1487
- ddt_bias = None
1488
- if ddt is not None:
1489
- assert ddt.shape == dt.shape
1490
- else:
1491
- ddt = torch.empty_like(dt)
1492
- dA = torch.empty_like(A, dtype=torch.float32)
1493
- grid_chunk_cs = lambda META: (
1494
- batch,
1495
- nchunks,
1496
- triton.cdiv(nheads, META["BLOCK_SIZE_H"]),
1497
- )
1498
- with torch.cuda.device(dt.device.index):
1499
- _chunk_cumsum_bwd_kernel[grid_chunk_cs](
1500
- ddA,
1501
- ddt_out,
1502
- dt,
1503
- A,
1504
- dt_bias,
1505
- ddt,
1506
- dA,
1507
- ddt_bias,
1508
- batch,
1509
- seqlen,
1510
- nheads,
1511
- chunk_size,
1512
- dt_limit[0],
1513
- dt_limit[1],
1514
- ddA.stride(0),
1515
- ddA.stride(2),
1516
- ddA.stride(1),
1517
- ddA.stride(3),
1518
- ddt_out.stride(0),
1519
- ddt_out.stride(2),
1520
- ddt_out.stride(1),
1521
- ddt_out.stride(3),
1522
- dt.stride(0),
1523
- dt.stride(1),
1524
- dt.stride(2),
1525
- A.stride(0),
1526
- dt_bias.stride(0) if dt_bias is not None else 0,
1527
- ddt.stride(0),
1528
- ddt.stride(1),
1529
- ddt.stride(2),
1530
- dA.stride(0),
1531
- ddt_bias.stride(0) if ddt_bias is not None else 0,
1532
- dt_softplus,
1533
- HAS_DT_BIAS=dt_bias is not None,
1534
- BLOCK_SIZE_CHUNK=triton.next_power_of_2(chunk_size),
1535
- )
1536
- return ddt, dA, ddt_bias
1537
-
1538
-
1539
- def _chunk_state_fwd(
1540
- B, x, dt, dA_cumsum, seq_idx=None, states=None, states_in_fp32=True
1541
- ):
1542
- batch, seqlen, nheads, headdim = x.shape
1543
- _, _, nchunks, chunk_size = dt.shape
1544
- _, _, ngroups, dstate = B.shape
1545
- assert nheads % ngroups == 0
1546
- assert B.shape == (batch, seqlen, ngroups, dstate)
1547
- assert dt.shape == (batch, nheads, nchunks, chunk_size)
1548
- assert dA_cumsum.shape == dt.shape
1549
- if seq_idx is not None:
1550
- assert seq_idx.shape == (batch, seqlen)
1551
- if states is not None:
1552
- assert states.shape == (batch, nchunks, nheads, headdim, dstate)
1553
- else:
1554
- states_dtype = torch.float32 if states_in_fp32 else B.dtype
1555
- states = torch.empty(
1556
- (batch, nchunks, nheads, headdim, dstate),
1557
- device=x.device,
1558
- dtype=states_dtype,
1559
- )
1560
- grid = lambda META: (
1561
- triton.cdiv(headdim, META["BLOCK_SIZE_M"])
1562
- * triton.cdiv(dstate, META["BLOCK_SIZE_N"]),
1563
- batch * nchunks,
1564
- nheads,
1565
- )
1566
- with torch.cuda.device(x.device.index):
1567
- _chunk_state_fwd_kernel[grid](
1568
- x,
1569
- B,
1570
- states,
1571
- dt,
1572
- dA_cumsum,
1573
- seq_idx,
1574
- headdim,
1575
- dstate,
1576
- chunk_size,
1577
- batch,
1578
- seqlen,
1579
- nheads // ngroups,
1580
- x.stride(0),
1581
- x.stride(1),
1582
- x.stride(2),
1583
- x.stride(3),
1584
- B.stride(0),
1585
- B.stride(1),
1586
- B.stride(2),
1587
- B.stride(-1),
1588
- states.stride(0),
1589
- states.stride(1),
1590
- states.stride(2),
1591
- states.stride(3),
1592
- states.stride(4),
1593
- dt.stride(0),
1594
- dt.stride(2),
1595
- dt.stride(1),
1596
- dt.stride(3),
1597
- dA_cumsum.stride(0),
1598
- dA_cumsum.stride(2),
1599
- dA_cumsum.stride(1),
1600
- dA_cumsum.stride(3),
1601
- *(
1602
- (seq_idx.stride(0), seq_idx.stride(1))
1603
- if seq_idx is not None
1604
- else (0, 0)
1605
- ),
1606
- HAS_SEQ_IDX=seq_idx is not None,
1607
- )
1608
- return states
1609
-
1610
-
1611
- def _chunk_state_bwd_dx(B, x, dt, dA_cumsum, dstates, dx=None):
1612
- batch, seqlen, nheads, headdim = x.shape
1613
- _, _, nchunks, chunk_size = dt.shape
1614
- _, _, ngroups, dstate = B.shape
1615
- assert nheads % ngroups == 0
1616
- assert B.shape == (batch, seqlen, ngroups, dstate)
1617
- assert dt.shape == (batch, nheads, nchunks, chunk_size)
1618
- assert dA_cumsum.shape == dt.shape
1619
- assert dstates.shape == (batch, nchunks, nheads, headdim, dstate)
1620
- if dx is not None:
1621
- assert dx.shape == x.shape
1622
- else:
1623
- dx = torch.empty_like(x)
1624
- ddt = torch.empty(
1625
- batch, nheads, nchunks, chunk_size, device=dt.device, dtype=torch.float32
1626
- )
1627
- ddA_cumsum = torch.empty(
1628
- batch, nheads, nchunks, chunk_size, device=dA_cumsum.device, dtype=torch.float32
1629
- )
1630
- grid_dx = lambda META: (
1631
- triton.cdiv(chunk_size, META["BLOCK_SIZE_M"])
1632
- * triton.cdiv(headdim, META["BLOCK_SIZE_N"]),
1633
- batch * nchunks,
1634
- nheads,
1635
- )
1636
- with torch.cuda.device(x.device.index):
1637
- _chunk_state_bwd_dx_kernel[grid_dx](
1638
- x,
1639
- B,
1640
- dstates,
1641
- dt,
1642
- dA_cumsum,
1643
- dx,
1644
- ddt,
1645
- ddA_cumsum,
1646
- chunk_size,
1647
- headdim,
1648
- dstate,
1649
- batch,
1650
- seqlen,
1651
- nheads // ngroups,
1652
- x.stride(0),
1653
- x.stride(1),
1654
- x.stride(2),
1655
- x.stride(3),
1656
- B.stride(0),
1657
- B.stride(1),
1658
- B.stride(2),
1659
- B.stride(-1),
1660
- dstates.stride(0),
1661
- dstates.stride(1),
1662
- dstates.stride(2),
1663
- dstates.stride(3),
1664
- dstates.stride(4),
1665
- dt.stride(0),
1666
- dt.stride(2),
1667
- dt.stride(1),
1668
- dt.stride(3),
1669
- dA_cumsum.stride(0),
1670
- dA_cumsum.stride(2),
1671
- dA_cumsum.stride(1),
1672
- dA_cumsum.stride(3),
1673
- dx.stride(0),
1674
- dx.stride(1),
1675
- dx.stride(2),
1676
- dx.stride(3),
1677
- ddt.stride(0),
1678
- ddt.stride(2),
1679
- ddt.stride(1),
1680
- ddt.stride(3),
1681
- ddA_cumsum.stride(0),
1682
- ddA_cumsum.stride(2),
1683
- ddA_cumsum.stride(1),
1684
- ddA_cumsum.stride(3),
1685
- BLOCK_SIZE_DSTATE=max(triton.next_power_of_2(dstate), 16),
1686
- )
1687
- return dx, ddt.to(dt.dtype), ddA_cumsum.to(dA_cumsum.dtype)
1688
-
1689
-
1690
- def _chunk_state_bwd_db(x, dt, dA_cumsum, dstates, seq_idx=None, B=None, ngroups=1):
1691
- batch, seqlen, nheads, headdim = x.shape
1692
- _, _, nchunks, chunk_size = dt.shape
1693
- dstate = dstates.shape[-1]
1694
- assert dt.shape == (batch, nheads, nchunks, chunk_size)
1695
- assert dA_cumsum.shape == dt.shape
1696
- assert dstates.shape == (batch, nchunks, nheads, headdim, dstate)
1697
- if seq_idx is not None:
1698
- assert seq_idx.shape == (batch, seqlen)
1699
- if B is not None:
1700
- assert B.shape == (batch, seqlen, ngroups, dstate)
1701
- B_strides = (B.stride(0), B.stride(1), B.stride(2), B.stride(3))
1702
- # Use torch.empty since the Triton kernel will call init_to_zero
1703
- ddA_cumsum = torch.empty(
1704
- batch, nheads, nchunks, chunk_size, device=x.device, dtype=torch.float32
1705
- )
1706
- ddA_cumsum_strides = (
1707
- ddA_cumsum.stride(0),
1708
- ddA_cumsum.stride(2),
1709
- ddA_cumsum.stride(1),
1710
- ddA_cumsum.stride(3),
1711
- )
1712
- else:
1713
- B_strides = (0, 0, 0, 0)
1714
- ddA_cumsum = None
1715
- ddA_cumsum_strides = (0, 0, 0, 0)
1716
- nheads_ngroups_ratio = nheads // ngroups
1717
- sm_count = torch.cuda.get_device_properties(x.device).multi_processor_count
1718
- nheads_per_program = max(
1719
- min(math.ceil(batch * nchunks * nheads / sm_count), nheads_ngroups_ratio), 1
1720
- )
1721
- nsplits = triton.cdiv(nheads_ngroups_ratio, nheads_per_program)
1722
- dB = torch.empty(
1723
- batch, seqlen, nsplits, ngroups, dstate, device=x.device, dtype=torch.float32
1724
- )
1725
- grid_db = lambda META: (
1726
- triton.cdiv(chunk_size, META["BLOCK_SIZE_M"])
1727
- * triton.cdiv(dstate, META["BLOCK_SIZE_N"]),
1728
- batch * nchunks,
1729
- nsplits * ngroups,
1730
- )
1731
- with torch.cuda.device(x.device.index):
1732
- _chunk_state_bwd_db_kernel[grid_db](
1733
- x,
1734
- dstates,
1735
- B,
1736
- dt,
1737
- dA_cumsum,
1738
- seq_idx,
1739
- dB,
1740
- ddA_cumsum,
1741
- chunk_size,
1742
- dstate,
1743
- headdim,
1744
- batch,
1745
- seqlen,
1746
- nheads,
1747
- nheads_per_program,
1748
- ngroups,
1749
- x.stride(0),
1750
- x.stride(1),
1751
- x.stride(2),
1752
- x.stride(3),
1753
- dstates.stride(0),
1754
- dstates.stride(1),
1755
- dstates.stride(2),
1756
- dstates.stride(3),
1757
- dstates.stride(4),
1758
- *B_strides,
1759
- dt.stride(0),
1760
- dt.stride(2),
1761
- dt.stride(1),
1762
- dt.stride(3),
1763
- dA_cumsum.stride(0),
1764
- dA_cumsum.stride(2),
1765
- dA_cumsum.stride(1),
1766
- dA_cumsum.stride(3),
1767
- *(
1768
- (seq_idx.stride(0), seq_idx.stride(1))
1769
- if seq_idx is not None
1770
- else (0, 0)
1771
- ),
1772
- dB.stride(0),
1773
- dB.stride(1),
1774
- dB.stride(2),
1775
- dB.stride(3),
1776
- dB.stride(4),
1777
- *ddA_cumsum_strides,
1778
- HAS_DDA_CS=ddA_cumsum is not None,
1779
- HAS_SEQ_IDX=seq_idx is not None,
1780
- BLOCK_SIZE_K=max(triton.next_power_of_2(headdim), 16),
1781
- )
1782
- dB = dB.sum(2)
1783
- if ddA_cumsum is not None:
1784
- # The first element of ddA_cumsum is always zero, since that dA_cumsum does not contribute
1785
- # to the state of the chunk.
1786
- # torch.cumsum(ddA_cumsum[..., 1:], dim=-1, out=ddA_cumsum[..., 1:])
1787
- # But it's easier to just do the cumsum for all elements, the result will be the same.
1788
- torch.cumsum(ddA_cumsum, dim=-1, out=ddA_cumsum)
1789
- return dB if B is None else (dB, ddA_cumsum)
1790
-
1791
-
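A tiny numeric illustration (not upstream code) of the cumsum comment above: because position 0 of ddA_cumsum is always zero (init_to_zero plus the shifted atomic_add store), cumsumming the whole last axis gives the same result as cumsumming only elements 1 and onward.

import torch

ddA = torch.tensor([0.0, 0.3, -0.1, 0.2])                # position 0 is always zero
full = torch.cumsum(ddA, dim=-1)
partial = ddA.clone()
partial[1:] = torch.cumsum(ddA[1:], dim=-1)
assert torch.equal(full, partial)                        # identical because ddA[0] == 0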
1792
- def _chunk_state_bwd_ddAcs_stable(B, x, dt, dA_cumsum, dstates, seq_idx=None):
1793
- batch, seqlen, nheads, headdim = x.shape
1794
- _, _, nchunks, chunk_size = dt.shape
1795
- _, _, ngroups, dstate = B.shape
1796
- assert nheads % ngroups == 0
1797
- assert B.shape == (batch, seqlen, ngroups, dstate)
1798
- assert dt.shape == (batch, nheads, nchunks, chunk_size)
1799
- assert dA_cumsum.shape == dt.shape
1800
- assert dstates.shape == (batch, nchunks, nheads, headdim, dstate)
1801
- if seq_idx is not None:
1802
- assert seq_idx.shape == (batch, seqlen)
1803
- # Use torch.empty since the Triton kernel will call init_to_zero
1804
- ddA_cumsum = torch.empty(
1805
- batch, nheads, nchunks, chunk_size, device=x.device, dtype=torch.float32
1806
- )
1807
- grid_ddtcs = lambda META: (
1808
- triton.cdiv(chunk_size, META["BLOCK_SIZE_M"])
1809
- * triton.cdiv(headdim, META["BLOCK_SIZE_N"]),
1810
- batch * nchunks,
1811
- nheads,
1812
- )
1813
- with torch.cuda.device(x.device.index):
1814
- _chunk_state_bwd_ddAcs_stable_kernel[grid_ddtcs](
1815
- x,
1816
- B,
1817
- dstates,
1818
- dt,
1819
- dA_cumsum,
1820
- seq_idx,
1821
- ddA_cumsum,
1822
- chunk_size,
1823
- headdim,
1824
- dstate,
1825
- batch,
1826
- seqlen,
1827
- nheads // ngroups,
1828
- x.stride(0),
1829
- x.stride(1),
1830
- x.stride(2),
1831
- x.stride(3),
1832
- B.stride(0),
1833
- B.stride(1),
1834
- B.stride(2),
1835
- B.stride(-1),
1836
- dstates.stride(0),
1837
- dstates.stride(1),
1838
- dstates.stride(2),
1839
- dstates.stride(3),
1840
- dstates.stride(4),
1841
- dt.stride(0),
1842
- dt.stride(2),
1843
- dt.stride(1),
1844
- dt.stride(3),
1845
- dA_cumsum.stride(0),
1846
- dA_cumsum.stride(2),
1847
- dA_cumsum.stride(1),
1848
- dA_cumsum.stride(3),
1849
- *(
1850
- (seq_idx.stride(0), seq_idx.stride(1))
1851
- if seq_idx is not None
1852
- else (0, 0)
1853
- ),
1854
- ddA_cumsum.stride(0),
1855
- ddA_cumsum.stride(2),
1856
- ddA_cumsum.stride(1),
1857
- ddA_cumsum.stride(3),
1858
- HAS_SEQ_IDX=seq_idx is not None,
1859
- BLOCK_SIZE_M=max(triton.next_power_of_2(chunk_size), 16),
1860
- BLOCK_SIZE_DSTATE=max(triton.next_power_of_2(dstate), 16),
1861
- )
1862
- torch.cumsum(ddA_cumsum[..., 1:], dim=-1, out=ddA_cumsum[..., 1:])
1863
- return ddA_cumsum
1864
-
1865
-
1866
- def chunk_state_varlen(B, x, dt, dA_cumsum, cu_seqlens, chunk_states):
1867
- total_seqlen, nheads, headdim = x.shape
1868
- _, nchunks, chunk_size = dt.shape
1869
- _, ngroups, dstate = B.shape
1870
- batch = cu_seqlens.shape[0] - 1
1871
- cu_seqlens = cu_seqlens.contiguous()
1872
- assert nheads % ngroups == 0
1873
- assert B.shape == (total_seqlen, ngroups, dstate)
1874
- assert dt.shape == (nheads, nchunks, chunk_size)
1875
- assert dA_cumsum.shape == dt.shape
1876
- assert chunk_states.shape == (nchunks, nheads, headdim, dstate)
1877
- states = torch.empty(
1878
- batch,
1879
- nheads,
1880
- headdim,
1881
- dstate,
1882
- dtype=chunk_states.dtype,
1883
- device=chunk_states.device,
1884
- )
1885
- grid = lambda META: (
1886
- triton.cdiv(headdim, META["BLOCK_SIZE_M"])
1887
- * triton.cdiv(dstate, META["BLOCK_SIZE_N"]),
1888
- batch,
1889
- nheads,
1890
- )
1891
- with torch.cuda.device(x.device.index):
1892
- _chunk_state_varlen_kernel[grid](
1893
- x,
1894
- B,
1895
- dt,
1896
- dA_cumsum,
1897
- chunk_states,
1898
- cu_seqlens,
1899
- states,
1900
- headdim,
1901
- dstate,
1902
- chunk_size,
1903
- total_seqlen,
1904
- nheads // ngroups,
1905
- x.stride(0),
1906
- x.stride(1),
1907
- x.stride(2),
1908
- B.stride(0),
1909
- B.stride(1),
1910
- B.stride(2),
1911
- dt.stride(1),
1912
- dt.stride(0),
1913
- dt.stride(2),
1914
- dA_cumsum.stride(1),
1915
- dA_cumsum.stride(0),
1916
- dA_cumsum.stride(2),
1917
- chunk_states.stride(0),
1918
- chunk_states.stride(1),
1919
- chunk_states.stride(2),
1920
- chunk_states.stride(3),
1921
- states.stride(0),
1922
- states.stride(1),
1923
- states.stride(2),
1924
- states.stride(3),
1925
- )
1926
- return states
1927
-
1928
-
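A small sketch (outside the diff) of the cu_seqlens convention assumed by chunk_state_varlen above: sequences are packed along one axis and sequence b spans rows [cu_seqlens[b], cu_seqlens[b+1]). The lengths below are hypothetical.

import torch

seqlens = torch.tensor([100, 37, 250])
cu_seqlens = torch.cat([seqlens.new_zeros(1), seqlens.cumsum(0)]).to(torch.int32).cuda()
# cu_seqlens -> tensor([0, 100, 137, 387]); batch = cu_seqlens.numel() - 1 == 3
# total_seqlen (the leading dim of x and B) must equal int(cu_seqlens[-1]) == 387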
1929
- class ChunkStateFn(torch.autograd.Function):
1930
-
1931
- @staticmethod
1932
- def forward(ctx, B, x, dt, dA_cumsum, states_in_fp32=True):
1933
- batch, seqlen, nheads, headdim = x.shape
1934
- _, _, nchunks, chunk_size = dt.shape
1935
- assert seqlen <= nchunks * chunk_size
1936
- _, _, ngroups, dstate = B.shape
1937
- assert B.shape == (batch, seqlen, ngroups, dstate)
1938
- assert dt.shape == (batch, nheads, nchunks, chunk_size)
1939
- assert dA_cumsum.shape == (batch, nheads, nchunks, chunk_size)
1940
- if B.stride(-1) != 1:
1941
- B = B.contiguous()
1942
- if (
1943
- x.stride(-1) != 1 and x.stride(1) != 1
1944
- ): # Either M or K dimension should be contiguous
1945
- x = x.contiguous()
1946
- states = _chunk_state_fwd(B, x, dt, dA_cumsum, states_in_fp32=states_in_fp32)
1947
- ctx.save_for_backward(B, x, dt, dA_cumsum)
1948
- return states
1949
-
1950
- @staticmethod
1951
- def backward(ctx, dstates):
1952
- B, x, dt, dA_cumsum = ctx.saved_tensors
1953
- batch, seqlen, nheads, headdim = x.shape
1954
- _, _, nchunks, chunk_size = dt.shape
1955
- _, _, ngroups, dstate = B.shape
1956
- assert dstates.shape == (batch, nchunks, nheads, headdim, dstate)
1957
- if dstates.stride(-1) != 1:
1958
- dstates = dstates.contiguous()
1959
- dx, ddt, ddA_cumsum = _chunk_state_bwd_dx(B, x, dt, dA_cumsum, dstates)
1960
- dB = _chunk_state_bwd_db(x, dt, dA_cumsum, dstates, ngroups=ngroups)
1961
- dB = dB.to(B.dtype)
1962
- return dB, dx, ddt, ddA_cumsum, None
1963
-
1964
-
1965
- def chunk_state(B, x, dt, dA_cumsum, states_in_fp32=True):
1966
- """
1967
- Argument:
1968
- B: (batch, seqlen, ngroups, dstate)
1969
- x: (batch, seqlen, nheads, headdim)
1970
- dt: (batch, nheads, nchunks, chunk_size)
1971
- dA_cumsum: (batch, nheads, nchunks, chunk_size)
1972
- Return:
1973
- states: (batch, nchunks, nheads, headdim, dstate)
1974
- """
1975
- return ChunkStateFn.apply(B, x, dt, dA_cumsum, states_in_fp32)
1976
-
1977
-
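For orientation (not part of this diff), a hypothetical call of chunk_state above with shapes matching its docstring; the sizes are illustrative and a CUDA device with Triton is assumed.

import torch

batch, seqlen, nheads, headdim = 2, 512, 8, 64
ngroups, dstate, chunk_size = 1, 128, 256
nchunks = (seqlen + chunk_size - 1) // chunk_size
x = torch.randn(batch, seqlen, nheads, headdim, device="cuda", dtype=torch.bfloat16)
B = torch.randn(batch, seqlen, ngroups, dstate, device="cuda", dtype=torch.bfloat16)
dt = torch.rand(batch, nheads, nchunks, chunk_size, device="cuda")
dA_cumsum = (-dt).cumsum(dim=-1)                         # any per-chunk cumulative decay works for a smoke test
states = chunk_state(B, x, dt, dA_cumsum)
assert states.shape == (batch, nchunks, nheads, headdim, dstate)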
1978
- def chunk_state_ref(B, x, dt, dA_cumsum):
1979
- """
1980
- Argument:
1981
- B: (batch, seqlen, ngroups, dstate)
1982
- x: (batch, seqlen, nheads, headdim)
1983
- dt: (batch, nheads, nchunks, chunk_size)
1984
- dA_cumsum: (batch, nheads, nchunks, chunk_size)
1985
- Return:
1986
- states: (batch, nchunks, nheads, headdim, dstate)
1987
- """
1988
- # Check constraints.
1989
- batch, seqlen, nheads, headdim = x.shape
1990
- dstate = B.shape[-1]
1991
- _, _, nchunks, chunk_size = dt.shape
1992
- assert seqlen <= nchunks * chunk_size
1993
- assert x.shape == (batch, seqlen, nheads, headdim)
1994
- assert dt.shape == (batch, nheads, nchunks, chunk_size)
1995
- ngroups = B.shape[2]
1996
- assert nheads % ngroups == 0
1997
- assert B.shape == (batch, seqlen, ngroups, dstate)
1998
- B = repeat(B, "b l g d -> b l (g h) d", h=nheads // ngroups)
1999
- assert dA_cumsum.shape == (batch, nheads, nchunks, chunk_size)
2000
- if seqlen < nchunks * chunk_size:
2001
- x = F.pad(x, (0, 0, 0, 0, 0, nchunks * chunk_size - seqlen))
2002
- B = F.pad(B, (0, 0, 0, 0, 0, nchunks * chunk_size - seqlen))
2003
- x = rearrange(x, "b (c l) h p -> b c l h p", l=chunk_size)
2004
- B = rearrange(B, "b (c l) ... -> b c l ...", l=chunk_size)
2005
- decay_states = torch.exp((dA_cumsum[:, :, :, -1:] - dA_cumsum))
2006
- return torch.einsum(
2007
- "bclhn,bhcl,bhcl,bclhp->bchpn",
2008
- B.to(x.dtype),
2009
- decay_states.to(x.dtype),
2010
- dt.to(x.dtype),
2011
- x,
2012
- )
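A hedged sanity-check sketch (not upstream code), reusing the tensors from the snippet above chunk_state_ref: the Triton path and the einsum reference compute the same contraction, so they should agree to within bfloat16-level tolerance.

out_triton = chunk_state(B, x, dt, dA_cumsum, states_in_fp32=True)   # float32 output
out_ref = chunk_state_ref(B, x, dt, dA_cumsum)                       # einsum reference in x.dtype
torch.testing.assert_close(out_triton, out_ref.float(), rtol=1e-2, atol=1e-2)
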
build/torch25-cxx11-cu124-x86_64-linux/mamba_ssm/ops/triton/ssd_combined.py DELETED
@@ -1,1884 +0,0 @@
1
- # Copyright (c) 2024, Tri Dao, Albert Gu.
2
-
3
- """We want triton==2.1.0 or 2.2.0 for this
4
- """
5
-
6
- from typing import Optional
7
-
8
- import math
9
- from packaging import version
10
-
11
- import torch
12
- import torch.nn.functional as F
13
- from torch import Tensor
14
- from ...utils.torch import custom_bwd, custom_fwd
15
-
16
- import triton
17
- import triton.language as tl
18
-
19
- from einops import rearrange, repeat
20
-
21
- try:
22
- from causal_conv1d import causal_conv1d_fn
23
- import causal_conv1d_cuda
24
- except ImportError:
25
- causal_conv1d_fn, causal_conv1d_cuda = None, None
26
-
27
- from .ssd_bmm import _bmm_chunk_fwd, _bmm_chunk_bwd
28
- from .ssd_chunk_state import _chunk_cumsum_fwd, _chunk_cumsum_bwd
29
- from .ssd_chunk_state import _chunk_state_fwd, _chunk_state_bwd_db
30
- from .ssd_chunk_state import _chunk_state_bwd_ddAcs_stable
31
- from .ssd_chunk_state import chunk_state, chunk_state_ref
32
- from .ssd_chunk_state import chunk_state_varlen
33
- from .ssd_state_passing import _state_passing_fwd, _state_passing_bwd
34
- from .ssd_state_passing import state_passing, state_passing_ref
35
- from .ssd_chunk_scan import _chunk_scan_fwd, _chunk_scan_bwd_dz, _chunk_scan_bwd_dstates
36
- from .ssd_chunk_scan import _chunk_scan_bwd_dC, _chunk_scan_bwd_dcb
37
- from .ssd_chunk_scan import _chunk_scan_bwd_ddAcs_stable
38
- from .ssd_chunk_scan import chunk_scan, chunk_scan_ref
39
- from .ssd_chunk_scan import _chunk_scan_bwd_ddAcs_prev
40
- from .layernorm_gated import rmsnorm_fn, _layer_norm_fwd, _layer_norm_bwd
41
- from .k_activations import _swiglu_fwd, _swiglu_bwd
42
-
43
- TRITON_22 = version.parse(triton.__version__) >= version.parse("2.2.0")
44
-
45
-
46
- def init_to_zero(names):
47
- return lambda nargs: [
48
- nargs[name].zero_() for name in names if nargs[name] is not None
49
- ]
50
-
51
-
52
- @triton.autotune(
53
- configs=[
54
- triton.Config(
55
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64},
56
- num_stages=3,
57
- num_warps=8,
58
- pre_hook=init_to_zero(["ddt_ptr"]),
59
- ),
60
- triton.Config(
61
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32},
62
- num_stages=4,
63
- num_warps=4,
64
- pre_hook=init_to_zero(["ddt_ptr"]),
65
- ),
66
- triton.Config(
67
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32},
68
- num_stages=4,
69
- num_warps=4,
70
- pre_hook=init_to_zero(["ddt_ptr"]),
71
- ),
72
- triton.Config(
73
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
74
- num_stages=4,
75
- num_warps=4,
76
- pre_hook=init_to_zero(["ddt_ptr"]),
77
- ),
78
- triton.Config(
79
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32},
80
- num_stages=4,
81
- num_warps=4,
82
- pre_hook=init_to_zero(["ddt_ptr"]),
83
- ),
84
- triton.Config(
85
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32},
86
- num_stages=4,
87
- num_warps=4,
88
- pre_hook=init_to_zero(["ddt_ptr"]),
89
- ),
90
- triton.Config(
91
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32},
92
- num_stages=5,
93
- num_warps=4,
94
- pre_hook=init_to_zero(["ddt_ptr"]),
95
- ),
96
- triton.Config(
97
- {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
98
- num_stages=5,
99
- num_warps=4,
100
- pre_hook=init_to_zero(["ddt_ptr"]),
101
- ),
102
- triton.Config(
103
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
104
- num_stages=4,
105
- num_warps=4,
106
- pre_hook=init_to_zero(["ddt_ptr"]),
107
- ),
108
- ],
109
- key=["chunk_size", "hdim", "dstate"],
110
- )
111
- @triton.jit
112
- def _chunk_scan_chunk_state_bwd_dx_kernel(
113
- # Pointers to matrices
114
- x_ptr,
115
- cb_ptr,
116
- dout_ptr,
117
- dt_ptr,
118
- dA_cumsum_ptr,
119
- seq_idx_ptr,
120
- D_ptr,
121
- b_ptr,
122
- dstates_ptr,
123
- dx_ptr,
124
- ddt_ptr,
125
- dD_ptr,
126
- # Matrix dimensions
127
- chunk_size,
128
- hdim,
129
- dstate,
130
- batch,
131
- seqlen,
132
- nheads_ngroups_ratio,
133
- # Strides
134
- stride_x_batch,
135
- stride_x_seqlen,
136
- stride_x_head,
137
- stride_x_hdim,
138
- stride_cb_batch,
139
- stride_cb_chunk,
140
- stride_cb_head,
141
- stride_cb_csize_m,
142
- stride_cb_csize_k,
143
- stride_dout_batch,
144
- stride_dout_seqlen,
145
- stride_dout_head,
146
- stride_dout_hdim,
147
- stride_dt_batch,
148
- stride_dt_chunk,
149
- stride_dt_head,
150
- stride_dt_csize,
151
- stride_dA_cs_batch,
152
- stride_dA_cs_chunk,
153
- stride_dA_cs_head,
154
- stride_dA_cs_csize,
155
- stride_seq_idx_batch,
156
- stride_seq_idx_seqlen,
157
- stride_D_head,
158
- stride_b_batch,
159
- stride_b_seqlen,
160
- stride_b_head,
161
- stride_b_dstate,
162
- stride_dstates_batch,
163
- stride_dstates_chunk,
164
- stride_dstates_head,
165
- stride_dstates_hdim,
166
- stride_dstates_dstate,
167
- stride_dx_batch,
168
- stride_dx_seqlen,
169
- stride_dx_head,
170
- stride_dx_hdim,
171
- stride_ddt_batch,
172
- stride_ddt_chunk,
173
- stride_ddt_head,
174
- stride_ddt_csize,
175
- stride_dD_batch,
176
- stride_dD_chunk,
177
- stride_dD_head,
178
- stride_dD_csize,
179
- stride_dD_hdim,
180
- # Meta-parameters
181
- HAS_D: tl.constexpr,
182
- D_HAS_HDIM: tl.constexpr,
183
- HAS_SEQ_IDX: tl.constexpr,
184
- BLOCK_SIZE_M: tl.constexpr,
185
- BLOCK_SIZE_N: tl.constexpr,
186
- BLOCK_SIZE_K: tl.constexpr,
187
- BLOCK_SIZE_DSTATE: tl.constexpr,
188
- IS_TRITON_22: tl.constexpr,
189
- ):
190
- pid_bc = tl.program_id(axis=1)
191
- pid_c = pid_bc // batch
192
- pid_b = pid_bc - pid_c * batch
193
- pid_h = tl.program_id(axis=2)
194
- num_pid_n = tl.cdiv(hdim, BLOCK_SIZE_N)
195
- pid_m = tl.program_id(axis=0) // num_pid_n
196
- pid_n = tl.program_id(axis=0) % num_pid_n
197
- x_ptr += (
198
- pid_b * stride_x_batch
199
- + pid_c * chunk_size * stride_x_seqlen
200
- + pid_h * stride_x_head
201
- )
202
- cb_ptr += (
203
- pid_b * stride_cb_batch
204
- + pid_c * stride_cb_chunk
205
- + (pid_h // nheads_ngroups_ratio) * stride_cb_head
206
- )
207
- dout_ptr += (
208
- pid_b * stride_dout_batch
209
- + pid_c * chunk_size * stride_dout_seqlen
210
- + pid_h * stride_dout_head
211
- )
212
- dt_ptr += pid_b * stride_dt_batch + pid_c * stride_dt_chunk + pid_h * stride_dt_head
213
- ddt_ptr += (
214
- pid_b * stride_ddt_batch + pid_c * stride_ddt_chunk + pid_h * stride_ddt_head
215
- )
216
- dA_cumsum_ptr += (
217
- pid_b * stride_dA_cs_batch
218
- + pid_c * stride_dA_cs_chunk
219
- + pid_h * stride_dA_cs_head
220
- )
221
- b_ptr += (
222
- pid_b * stride_b_batch
223
- + pid_c * chunk_size * stride_b_seqlen
224
- + (pid_h // nheads_ngroups_ratio) * stride_b_head
225
- )
226
- dstates_ptr += (
227
- pid_b * stride_dstates_batch
228
- + pid_c * stride_dstates_chunk
229
- + pid_h * stride_dstates_head
230
- )
231
- if HAS_SEQ_IDX:
232
- seq_idx_ptr += (
233
- pid_b * stride_seq_idx_batch + pid_c * chunk_size * stride_seq_idx_seqlen
234
- )
235
-
236
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
237
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
238
-
239
- chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
240
-
241
- acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
242
-
243
- dA_cs_m = tl.load(
244
- dA_cumsum_ptr + offs_m * stride_dA_cs_csize,
245
- mask=offs_m < chunk_size_limit,
246
- other=0.0,
247
- ).to(tl.float32)
248
-
249
- dA_cs_last = tl.load(dA_cumsum_ptr + (chunk_size - 1) * stride_dA_cs_csize).to(
250
- tl.float32
251
- )
252
- if not HAS_SEQ_IDX:
253
- scale = tl.exp(dA_cs_last - dA_cs_m)
254
- else:
255
- seq_idx_m = tl.load(
256
- seq_idx_ptr + offs_m * stride_seq_idx_seqlen,
257
- mask=offs_m < chunk_size_limit,
258
- other=-1,
259
- )
260
- seq_idx_last = tl.load(
261
- seq_idx_ptr + (chunk_size_limit - 1) * stride_seq_idx_seqlen
262
- )
263
- scale = tl.where(seq_idx_m == seq_idx_last, tl.exp(dA_cs_last - dA_cs_m), 0.0)
264
- # Might be faster to just do 1 iteration with larger BLOCK_SIZE_K, up to block size 128
265
- # However, we're getting an error with the Triton compiler 2.1.0 for that code path:
266
- # Unexpected mma -> mma layout conversion
267
- # Triton 2.2.0 fixes this
268
- offs_dstate = tl.arange(
269
- 0,
270
- (
271
- BLOCK_SIZE_DSTATE
272
- if IS_TRITON_22 and BLOCK_SIZE_DSTATE <= 128
273
- else BLOCK_SIZE_K
274
- ),
275
- )
276
- b_ptrs = b_ptr + (
277
- offs_m[:, None] * stride_b_seqlen + offs_dstate[None, :] * stride_b_dstate
278
- )
279
- dstates_ptrs = dstates_ptr + (
280
- offs_n[None, :] * stride_dstates_hdim
281
- + offs_dstate[:, None] * stride_dstates_dstate
282
- )
283
- if IS_TRITON_22 and BLOCK_SIZE_DSTATE <= 128:
284
- b = tl.load(
285
- b_ptrs,
286
- mask=(offs_m[:, None] < chunk_size_limit) & (offs_dstate[None, :] < dstate),
287
- other=0.0,
288
- )
289
- dstates = tl.load(
290
- dstates_ptrs,
291
- mask=(offs_dstate[:, None] < dstate) & (offs_n[None, :] < hdim),
292
- other=0.0,
293
- )
294
- dstates = dstates.to(b_ptr.dtype.element_ty)
295
- acc = tl.dot(b, dstates) * scale[:, None]
296
- else:
297
- for k in range(0, dstate, BLOCK_SIZE_K):
298
- b = tl.load(
299
- b_ptrs,
300
- mask=(offs_m[:, None] < chunk_size_limit)
301
- & (offs_dstate[None, :] < dstate - k),
302
- other=0.0,
303
- )
304
- dstates = tl.load(
305
- dstates_ptrs,
306
- mask=(offs_dstate[:, None] < dstate - k) & (offs_n[None, :] < hdim),
307
- other=0.0,
308
- )
309
- dstates = dstates.to(b_ptr.dtype.element_ty)
310
- acc += tl.dot(b, dstates)
311
- b_ptrs += BLOCK_SIZE_K * stride_b_dstate
312
- dstates_ptrs += BLOCK_SIZE_K * stride_dstates_dstate
313
- acc *= scale[:, None]
314
-
315
- # x_ptrs = x_ptr + (offs_m[:, None] * stride_x_seqlen + offs_n[None, :] * stride_x_hdim)
316
- # x = tl.load(x_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim), other=0.0).to(tl.float32)
317
- # dt_ptrs = dt_ptr + offs_m * stride_dt_csize
318
- # dt_m = tl.load(dt_ptrs, mask=offs_m < chunk_size_limit, other=0.0).to(tl.float32)
319
- # ddt = tl.sum(acc * x, axis=1) * dt_m
320
- # ddt_ptrs = ddt_ptr + offs_m * stride_ddt_csize
321
- # tl.atomic_add(ddt_ptrs, ddt, mask=offs_m < chunk_size)
322
-
323
- offs_k = tl.arange(0, BLOCK_SIZE_K)
324
- cb_ptrs = cb_ptr + (
325
- offs_m[:, None] * stride_cb_csize_m + offs_k[None, :] * stride_cb_csize_k
326
- )
327
- dout_ptrs = dout_ptr + (
328
- offs_k[:, None] * stride_dout_seqlen + offs_n[None, :] * stride_dout_hdim
329
- )
330
- dA_cumsum_ptrs = dA_cumsum_ptr + offs_k * stride_dA_cs_csize
331
- K_MAX = chunk_size_limit
332
- K_MIN = pid_m * BLOCK_SIZE_M
333
- cb_ptrs += K_MIN * stride_cb_csize_k
334
- dout_ptrs += K_MIN * stride_dout_seqlen
335
- dA_cumsum_ptrs += K_MIN * stride_dA_cs_csize
336
- for k in range(K_MIN, K_MAX, BLOCK_SIZE_K):
337
- k = tl.multiple_of(k, BLOCK_SIZE_K)
338
- # For some reason setting mask to (offs_m[:, None] < chunk_size_limit) is much slower
339
- cb = tl.load(
340
- cb_ptrs,
341
- mask=(offs_m[:, None] < chunk_size) & (offs_k[None, :] < K_MAX - k),
342
- other=0.0,
343
- )
344
- dout = tl.load(
345
- dout_ptrs,
346
- mask=(offs_k[:, None] < K_MAX - k) & (offs_n[None, :] < hdim),
347
- other=0.0,
348
- )
349
- dA_cs_k = tl.load(dA_cumsum_ptrs, mask=offs_k < K_MAX - k, other=0.0).to(
350
- tl.float32
351
- )
352
- cb *= tl.exp(dA_cs_k[None, :] - dA_cs_m[:, None])
353
- # If we don't have the (k + offs_k[None, :] < K_MAX) mask, for indices outside this range,
354
- # we might have dA_cs_m = 0.0 and dA_cs_k very negative, and tl.exp will return inf.
355
- # Multiplying with cb, which is 0.0 outside the range, will make the result NaN.
356
- # This will cause NaN in acc, and hence NaN in dx and ddt.
357
- mask = (k + offs_k[None, :] >= offs_m[:, None]) & (k + offs_k[None, :] < K_MAX)
358
- cb = tl.where(mask, cb, 0.0)
359
- cb = cb.to(dout_ptr.dtype.element_ty)
360
- acc += tl.dot(cb, dout)
361
- cb_ptrs += BLOCK_SIZE_K * stride_cb_csize_k
362
- dout_ptrs += BLOCK_SIZE_K * stride_dout_seqlen
363
- dA_cumsum_ptrs += BLOCK_SIZE_K * stride_dA_cs_csize
364
-
365
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
366
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
367
- dt_ptrs = dt_ptr + offs_m * stride_dt_csize
368
- dt_m = tl.load(dt_ptrs, mask=offs_m < chunk_size_limit, other=0.0).to(tl.float32)
369
- dx = acc * dt_m[:, None]
370
- dx_ptr += (
371
- pid_b * stride_dx_batch
372
- + pid_c * chunk_size * stride_dx_seqlen
373
- + pid_h * stride_dx_head
374
- )
375
- dx_ptrs = dx_ptr + (
376
- offs_m[:, None] * stride_dx_seqlen + offs_n[None, :] * stride_dx_hdim
377
- )
378
- if HAS_D:
379
- dout_res_ptrs = dout_ptr + (
380
- offs_m[:, None] * stride_dout_seqlen + offs_n[None, :] * stride_dout_hdim
381
- )
382
- dout_res = tl.load(
383
- dout_res_ptrs,
384
- mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim),
385
- other=0.0,
386
- ).to(tl.float32)
387
- if D_HAS_HDIM:
388
- D = tl.load(
389
- D_ptr + pid_h * stride_D_head + offs_n, mask=offs_n < hdim, other=0.0
390
- ).to(tl.float32)
391
- else:
392
- D = tl.load(D_ptr + pid_h * stride_D_head).to(tl.float32)
393
- dx += dout_res * D
394
- tl.store(
395
- dx_ptrs,
396
- dx,
397
- mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim),
398
- )
399
-
400
- x_ptrs = x_ptr + (
401
- offs_m[:, None] * stride_x_seqlen + offs_n[None, :] * stride_x_hdim
402
- )
403
- x = tl.load(
404
- x_ptrs,
405
- mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim),
406
- other=0.0,
407
- ).to(tl.float32)
408
- if HAS_D:
409
- dD_ptr += (
410
- pid_b * stride_dD_batch
411
- + pid_c * stride_dD_chunk
412
- + pid_h * stride_dD_head
413
- + pid_m * stride_dD_csize
414
- )
415
- if D_HAS_HDIM:
416
- dD_ptrs = dD_ptr + offs_n * stride_dD_hdim
417
- dD = tl.sum(dout_res * x, axis=0)
418
- tl.store(dD_ptrs, dD, mask=offs_n < hdim)
419
- else:
420
- dD = tl.sum(dout_res * x)
421
- tl.store(dD_ptr, dD)
422
- ddt = tl.sum(acc * x, axis=1)
423
- ddt_ptrs = ddt_ptr + offs_m * stride_ddt_csize
424
- tl.atomic_add(ddt_ptrs, ddt, mask=offs_m < chunk_size)
425
-
426
-
427
- def _chunk_scan_chunk_state_bwd_dx(
428
- x, dt, dA_cumsum, B, CB, dout, dstates, D=None, seq_idx=None, dx=None
429
- ):
430
- batch, seqlen, nheads, headdim = x.shape
431
- _, _, nchunks, chunk_size = dt.shape
432
- _, _, ngroups, dstate = B.shape
433
- assert nheads % ngroups == 0
434
- assert B.shape == (batch, seqlen, ngroups, dstate)
435
- assert CB.shape == (batch, nchunks, ngroups, chunk_size, chunk_size)
436
- assert dt.shape == (batch, nheads, nchunks, chunk_size)
437
- assert dA_cumsum.shape == dt.shape
438
- assert dout.shape == x.shape
439
- assert dstates.shape == (batch, nchunks, nheads, headdim, dstate)
440
- if seq_idx is not None:
441
- assert seq_idx.shape == (batch, seqlen)
442
- if D is not None:
443
- assert D.shape == (nheads, headdim) or D.shape == (nheads,)
444
- assert D.stride(-1) == 1
445
- BLOCK_SIZE_min = 32
446
- dD = torch.empty(
447
- triton.cdiv(chunk_size, BLOCK_SIZE_min),
448
- batch,
449
- nchunks,
450
- nheads,
451
- headdim if D.dim() == 2 else 1,
452
- device=D.device,
453
- dtype=torch.float32,
454
- )
455
- else:
456
- dD = None
457
- dD_strides = (
458
- (dD.stride(0), dD.stride(1), dD.stride(2), dD.stride(3), dD.stride(4))
459
- if D is not None
460
- else (0, 0, 0, 0, 0)
461
- )
462
- if dx is None:
463
- dx = torch.empty_like(x)
464
- else:
465
- assert dx.shape == x.shape
466
- ddt = torch.empty(
467
- batch, nheads, nchunks, chunk_size, device=dout.device, dtype=torch.float32
468
- )
469
- grid_dx = lambda META: (
470
- triton.cdiv(chunk_size, META["BLOCK_SIZE_M"])
471
- * triton.cdiv(headdim, META["BLOCK_SIZE_N"]),
472
- batch * nchunks,
473
- nheads,
474
- )
475
- with torch.cuda.device(x.device.index):
476
- _chunk_scan_chunk_state_bwd_dx_kernel[grid_dx](
477
- x,
478
- CB,
479
- dout,
480
- dt,
481
- dA_cumsum,
482
- seq_idx,
483
- D,
484
- B,
485
- dstates,
486
- dx,
487
- ddt,
488
- dD,
489
- chunk_size,
490
- headdim,
491
- dstate,
492
- batch,
493
- seqlen,
494
- nheads // ngroups,
495
- x.stride(0),
496
- x.stride(1),
497
- x.stride(2),
498
- x.stride(3),
499
- CB.stride(0),
500
- CB.stride(1),
501
- CB.stride(2),
502
- CB.stride(-1),
503
- CB.stride(-2),
504
- dout.stride(0),
505
- dout.stride(1),
506
- dout.stride(2),
507
- dout.stride(3),
508
- dt.stride(0),
509
- dt.stride(2),
510
- dt.stride(1),
511
- dt.stride(3),
512
- dA_cumsum.stride(0),
513
- dA_cumsum.stride(2),
514
- dA_cumsum.stride(1),
515
- dA_cumsum.stride(3),
516
- *(
517
- (seq_idx.stride(0), seq_idx.stride(1))
518
- if seq_idx is not None
519
- else (0, 0)
520
- ),
521
- D.stride(0) if D is not None else 0,
522
- B.stride(0),
523
- B.stride(1),
524
- B.stride(2),
525
- B.stride(3),
526
- dstates.stride(0),
527
- dstates.stride(1),
528
- dstates.stride(2),
529
- dstates.stride(3),
530
- dstates.stride(4),
531
- dx.stride(0),
532
- dx.stride(1),
533
- dx.stride(2),
534
- dx.stride(3),
535
- ddt.stride(0),
536
- ddt.stride(2),
537
- ddt.stride(1),
538
- ddt.stride(3),
539
- dD_strides[1],
540
- dD_strides[2],
541
- dD_strides[3],
542
- dD_strides[0],
543
- dD_strides[4],
544
- D is not None,
545
- D.dim() == 2 if D is not None else True,
546
- HAS_SEQ_IDX=seq_idx is not None,
547
- BLOCK_SIZE_DSTATE=max(triton.next_power_of_2(dstate), 16),
548
- IS_TRITON_22=TRITON_22
549
- )
550
- if D is not None:
551
- BLOCK_SIZE_actual = _chunk_scan_chunk_state_bwd_dx_kernel.best_config.kwargs[
552
- "BLOCK_SIZE_M"
553
- ]
554
- n_valid_blocks = (chunk_size + BLOCK_SIZE_actual - 1) // BLOCK_SIZE_actual
555
- dD = dD[:n_valid_blocks].sum(dim=(0, 1, 2)).to(dtype=D.dtype)
556
- if D.dim() == 1:
557
- dD = rearrange(dD, "h 1 -> h")
558
- return dx, ddt.to(dtype=dt.dtype), dD
559
-
560
-
561
- def _mamba_chunk_scan_combined_fwd(
562
- x,
563
- dt,
564
- A,
565
- B,
566
- C,
567
- chunk_size,
568
- D=None,
569
- z=None,
570
- dt_bias=None,
571
- initial_states=None,
572
- seq_idx=None,
573
- cu_seqlens=None,
574
- dt_softplus=False,
575
- dt_limit=(0.0, float("inf")),
576
- ):
577
- batch, seqlen, nheads, headdim = x.shape
578
- _, _, ngroups, dstate = B.shape
579
- assert nheads % ngroups == 0
580
- assert B.shape == (batch, seqlen, ngroups, dstate)
581
- assert x.shape == (batch, seqlen, nheads, headdim)
582
- assert dt.shape == (batch, seqlen, nheads)
583
- assert A.shape == (nheads,)
584
- assert C.shape == B.shape
585
- if z is not None:
586
- assert z.shape == x.shape
587
- if D is not None:
588
- assert D.shape == (nheads, headdim) or D.shape == (nheads,)
589
- if seq_idx is not None:
590
- assert seq_idx.shape == (batch, seqlen)
591
- if B.stride(-1) != 1:
592
- B = B.contiguous()
593
- if C.stride(-1) != 1:
594
- C = C.contiguous()
595
- if (
596
- x.stride(-1) != 1 and x.stride(1) != 1
597
- ): # Either M or K dimension should be contiguous
598
- x = x.contiguous()
599
- if (
600
- z is not None and z.stride(-1) != 1 and z.stride(1) != 1
601
- ): # Either M or K dimension should be contiguous
602
- z = z.contiguous()
603
- if D is not None and D.stride(-1) != 1:
604
- D = D.contiguous()
605
- if initial_states is not None:
606
- assert initial_states.shape == (batch, nheads, headdim, dstate)
607
- # # (batch, nchunks, chunk_size, chunk_size) or (batch, nchunks, nheads, chunk_size, chunk_size)
608
- # dA_cumsum_tmp0, dt_tmp0 = _chunk_cumsum_fwd(dt[:, :147], A, chunk_size, dt_bias=dt_bias, dt_softplus=dt_softplus)
609
- # dA_cumsum_tmp1, dt_tmp1 = _chunk_cumsum_fwd(dt[:, 147:], A, chunk_size, dt_bias=dt_bias, dt_softplus=dt_softplus)
610
- # dA_cumsum_tmp2, dt_tmp2 = _chunk_cumsum_fwd(dt[:, 147:256], A, chunk_size, dt_bias=dt_bias, dt_softplus=dt_softplus)
611
- dA_cumsum, dt = _chunk_cumsum_fwd(
612
- dt, A, chunk_size, dt_bias=dt_bias, dt_softplus=dt_softplus, dt_limit=dt_limit
613
- )
614
- states = _chunk_state_fwd(B, x, dt, dA_cumsum, seq_idx=seq_idx, states_in_fp32=True)
615
- # states_tmp0 = _chunk_state_fwd(B[:, :147], x[:, :147], dt_tmp0, dA_cumsum_tmp0, states_in_fp32=True)
616
- # states_tmp1 = _chunk_state_fwd(B[:, 147:], x[:, 147:], dt_tmp1, dA_cumsum_tmp1, states_in_fp32=True)
617
- # states_tmp2 = _chunk_state_fwd(B[:, 147:256], x[:, 147:256], dt_tmp2, dA_cumsum_tmp2, states_in_fp32=True)
618
- states, final_states = _state_passing_fwd(
619
- rearrange(states, "... p n -> ... (p n)"),
620
- dA_cumsum[:, :, :, -1],
621
- initial_states=(
622
- rearrange(initial_states, "... p n -> ... (p n)")
623
- if initial_states is not None
624
- else None
625
- ),
626
- seq_idx=seq_idx,
627
- chunk_size=chunk_size,
628
- out_dtype=C.dtype,
629
- )
630
- states, final_states = [
631
- rearrange(t, "... (p n) -> ... p n", n=dstate) for t in [states, final_states]
632
- ]
633
- # states_tmp0 = rearrange(_state_passing_fwd(rearrange(states_tmp0, "... p n -> ... (p n)"), dA_cumsum_tmp0[:, :, :, -1], chunk_size=chunk_size), "... (p n) -> ... p n", n=dstate)
634
- # states_tmp1 = rearrange(_state_passing_fwd(rearrange(states_tmp1, "... p n -> ... (p n)"), dA_cumsum_tmp1[:, :, :, -1], chunk_size=chunk_size), "... (p n) -> ... p n", n=dstate)
635
- CB = _bmm_chunk_fwd(C, B, chunk_size, seq_idx=seq_idx, output_dtype=torch.float32)
636
- out, out_x = _chunk_scan_fwd(
637
- CB, x, dt, dA_cumsum, C, states, D=D, z=z, seq_idx=seq_idx
638
- )
639
- if cu_seqlens is None:
640
- return out, out_x, dt, dA_cumsum, states, final_states
641
- else:
642
- assert (
643
- batch == 1
644
- ), "passing cu_seqlens to get the varlen states is only supported if batch dimension is 1"
645
- varlen_states = chunk_state_varlen(
646
- B.squeeze(0),
647
- x.squeeze(0),
648
- dt.squeeze(0),
649
- dA_cumsum.squeeze(0),
650
- cu_seqlens,
651
- states.squeeze(0),
652
- )
653
- return out, out_x, dt, dA_cumsum, states, final_states, varlen_states
654
-
655
-
656
- def _mamba_chunk_scan_combined_bwd(
657
- dout,
658
- x,
659
- dt,
660
- A,
661
- B,
662
- C,
663
- out,
664
- chunk_size,
665
- D=None,
666
- z=None,
667
- dt_bias=None,
668
- initial_states=None,
669
- dfinal_states=None,
670
- seq_idx=None,
671
- dt_softplus=False,
672
- dt_limit=(0.0, float("inf")),
673
- dx=None,
674
- ddt=None,
675
- dB=None,
676
- dC=None,
677
- dz=None,
678
- recompute_output=False,
679
- ):
680
- if dout.stride(-1) != 1:
681
- dout = dout.contiguous()
682
- batch, seqlen, nheads, headdim = x.shape
683
- nchunks = math.ceil(seqlen / chunk_size)
684
- _, _, ngroups, dstate = B.shape
685
- assert dout.shape == (batch, seqlen, nheads, headdim)
686
- assert dt.shape == (batch, seqlen, nheads)
687
- assert A.shape == (nheads,)
688
- assert nheads % ngroups == 0
689
- assert B.shape == (batch, seqlen, ngroups, dstate)
690
- assert C.shape == B.shape
691
- assert out.shape == x.shape
692
- if initial_states is not None:
693
- assert initial_states.shape == (batch, nheads, headdim, dstate)
694
- if seq_idx is not None:
695
- assert seq_idx.shape == (batch, seqlen)
696
- if dx is not None:
697
- assert dx.shape == x.shape
698
- if dB is not None:
699
- assert dB.shape == B.shape
700
- dB_given = dB
701
- else:
702
- dB_given = torch.empty_like(B)
703
- if dC is not None:
704
- assert dC.shape == C.shape
705
- dC_given = dC
706
- else:
707
- dC_given = torch.empty_like(C)
708
- if dz is not None:
709
- assert z is not None
710
- assert dz.shape == z.shape
711
- if ddt is not None:
712
- assert ddt.shape == dt.shape
713
- ddt_given = ddt
714
- else:
715
- ddt_given = torch.empty_like(dt)
716
- # TD: For some reason Triton (2.1.0 and 2.2.0) errors with
717
- # "[CUDA]: invalid device context" (e.g. during varlne test), and cloning makes it work. Idk why.
718
- dt_in = dt.clone()
719
- dA_cumsum, dt = _chunk_cumsum_fwd(
720
- dt_in,
721
- A,
722
- chunk_size,
723
- dt_bias=dt_bias,
724
- dt_softplus=dt_softplus,
725
- dt_limit=dt_limit,
726
- )
727
- CB = _bmm_chunk_fwd(C, B, chunk_size, seq_idx=seq_idx, output_dtype=torch.float32)
728
- states = _chunk_state_fwd(B, x, dt, dA_cumsum, seq_idx=seq_idx, states_in_fp32=True)
729
- states, _ = _state_passing_fwd(
730
- rearrange(states, "... p n -> ... (p n)"),
731
- dA_cumsum[:, :, :, -1],
732
- initial_states=(
733
- rearrange(initial_states, "... p n -> ... (p n)")
734
- if initial_states is not None
735
- else None
736
- ),
737
- seq_idx=seq_idx,
738
- chunk_size=chunk_size,
739
- )
740
- states = rearrange(states, "... (p n) -> ... p n", n=dstate)
741
- if z is not None:
742
- dz, dout, dD, *rest = _chunk_scan_bwd_dz(
743
- x,
744
- z,
745
- out,
746
- dout,
747
- chunk_size=chunk_size,
748
- has_ddAcs=False,
749
- D=D,
750
- dz=dz,
751
- recompute_output=recompute_output,
752
- )
753
- outz = rest[0] if recompute_output else out
754
- else:
755
- dz = None
756
- outz = out
757
- dstates = _chunk_scan_bwd_dstates(
758
- C, dA_cumsum, dout, seq_idx=seq_idx, dtype=states.dtype
759
- )
760
- # dstates has length nchunks, containing the gradient to initial states at index 0 and
761
- # gradient to the states of chunk (nchunks - 2) at index (nchunks - 1)
762
- # Do computation in fp32 but convert dstates and states to fp16/bf16 since dstates and states
763
- # will be used in matmul in the next kernels.
764
- dstates, ddA_chunk_cumsum, dinitial_states, states = _state_passing_bwd(
765
- rearrange(states, "... p n -> ... (p n)"),
766
- dA_cumsum[:, :, :, -1],
767
- rearrange(dstates, "... p n -> ... (p n)"),
768
- dfinal_states=(
769
- rearrange(dfinal_states, "... p n -> ... (p n)")
770
- if dfinal_states is not None
771
- else None
772
- ),
773
- seq_idx=seq_idx,
774
- has_initial_states=initial_states is not None,
775
- dstates_dtype=x.dtype,
776
- states_dtype=x.dtype,
777
- chunk_size=chunk_size,
778
- )
779
- # dstates has length nchunks, containing the gradient to states of chunk 0 at index 0 and
780
- # gradient to the final states at index (nchunks - 1)
781
- # states has length nchunks, containing the initial states at index 0 and the state for chunk (nchunks - 2) at index (nchunks - 1)
782
- # The final states is not stored.
783
- states = rearrange(states, "... (p n) -> ... p n", n=dstate)
784
- dstates = rearrange(dstates, "... (p n) -> ... p n", n=dstate)
785
- dinitial_states = (
786
- rearrange(dinitial_states, "... (p n) -> ... p n", n=dstate)
787
- if dinitial_states is not None
788
- else None
789
- )
790
- dx, ddt, dD_from_x = _chunk_scan_chunk_state_bwd_dx(
791
- x, dt, dA_cumsum, B, CB, dout, dstates, D=D, seq_idx=seq_idx, dx=dx
792
- )
793
- # dB = _chunk_state_bwd_db(x, dt, dA_cumsum, dstates, seq_idx=seq_idx, ngroups=ngroups)
794
- dB, ddA_next = _chunk_state_bwd_db(
795
- x, dt, dA_cumsum, dstates, seq_idx=seq_idx, B=B, ngroups=ngroups
796
- )
797
- # dC = _chunk_scan_bwd_dC(states[:, :-1].to(x.dtype), dA_cumsum, dout, seq_idx=seq_idx, ngroups=ngroups)
798
- dC, ddA_cumsum_prev = _chunk_scan_bwd_dC(
799
- states.to(x.dtype), dA_cumsum, dout, seq_idx=seq_idx, C=C, ngroups=ngroups
800
- )
801
- # Computing ddA with the dcb kernel is much slower, so we're not using it for now
802
- dCB = _chunk_scan_bwd_dcb(x, dt, dA_cumsum, dout, seq_idx=seq_idx, ngroups=ngroups)
803
- # dCB, ddA_tmp = _chunk_scan_bwd_dcb(x, dt, dA_cumsum, dout, seq_idx=seq_idx, CB=CB, ngroups=ngroups)
804
- dCB = dCB.to(CB.dtype)
805
- _bmm_chunk_bwd(C, dCB, residual=dB, out=dB_given)
806
- _bmm_chunk_bwd(B, rearrange(dCB, "... l s -> ... s l"), residual=dC, out=dC_given)
807
- # If we have z, then dout_x is recomputed in fp32 so dD = (dout_x * x).sum() is more accurate
808
- # than dD_from_x = (dout_x * x).sum() where dout_x is in fp16/bf16
809
- if z is None:
810
- dD = dD_from_x
811
- # Formula for ddA_cumsum, assuming out is the output of the forward pass before adding x * D.
812
- # ddA_cumsum = torch.einsum("bclhp,bclhp->bhcl", out.float(), dout.float()) - ddt * dt
813
- # However, this is numerically unstable: when we do the reverse cumsum on ddA_cumsum, there might
814
- # be a lot of underflow.
815
-
816
- # This is already done as part of bwd_dC kernel
817
- # ddA_cumsum_prev = _chunk_scan_bwd_ddAcs_prev(states[:, :-1], C, dout, dA_cumsum, seq_idx=seq_idx)
818
- ddA_cumsum_prev[..., -1] += ddA_chunk_cumsum
819
- ddA_prev = ddA_cumsum_prev.flip([-1]).cumsum(dim=-1).flip([-1])
820
- # This is already done as part of bwd_dB kernel
821
- # ddA_next = _chunk_state_bwd_ddAcs_stable(B, x, dt, dA_cumsum, dstates, seq_idx=seq_idx)
822
- # We don't need to pass in seq_idx because CB also zeros out entries where seq_idx[i] != seq_idx[j]
823
- ddA = _chunk_scan_bwd_ddAcs_stable(x, dt, dA_cumsum, dout, CB)
824
- ddA += ddA_next + ddA_prev
825
-
826
- ddt_given, dA, ddt_bias = _chunk_cumsum_bwd(
827
- ddA,
828
- ddt,
829
- dt_in,
830
- A,
831
- dt_bias=dt_bias,
832
- dt_softplus=dt_softplus,
833
- dt_limit=dt_limit,
834
- ddt=ddt_given,
835
- )
836
-
837
- # These 2 lines are just to test ddt and dA being computed by old code
838
- # _, dA = selective_scan_bwd(dout, x, dt, A, B, C, D=D.float(), z=z)
839
- # ddt_given.copy_(ddt)
840
-
841
- return_vals = (
842
- dx,
843
- ddt_given,
844
- dA,
845
- dB_given,
846
- dC_given,
847
- dD,
848
- dz,
849
- ddt_bias,
850
- dinitial_states,
851
- )
852
- return return_vals if not recompute_output else (*return_vals, outz)
853
-
854
-
855
- def selective_scan_bwd(dout, x, dt, A, B, C, D=None, z=None):
856
- """
857
- Argument:
858
- dout: (batch, seqlen, nheads, headdim)
859
- x: (batch, seqlen, nheads, headdim)
860
- dt: (batch, nheads, nchunks, chunk_size) or (batch, nheads, headdim, nchunks, chunk_size)
861
- A: (nheads) or (dim, dstate)
862
- B: (batch, seqlen, ngroups, dstate)
863
- C: (batch, seqlen, ngroups, dstate)
864
- D: (nheads, headdim) or (nheads,)
865
- z: (batch, seqlen, nheads, headdim)
866
- Return:
867
- out: (batch, seqlen, nheads, headdim)
868
- """
869
- import selective_scan
870
-
871
- batch, seqlen, nheads, headdim = x.shape
872
- chunk_size = dt.shape[-1]
873
- _, _, ngroups, dstate = B.shape
874
- assert nheads % ngroups == 0
875
- x = rearrange(x, "b l h p -> b (h p) l")
876
- squeeze_dt = dt.dim() == 4
877
- if dt.dim() == 4:
878
- dt = repeat(dt, "b h c l -> b h p c l", p=headdim)
879
- dt = rearrange(dt, "b h p c l -> b (h p) (c l)", p=headdim)
880
- squeeze_A = A.dim() == 1
881
- if A.dim() == 1:
882
- A = repeat(A, "h -> (h p) n", p=headdim, n=dstate).to(dtype=torch.float32)
883
- else:
884
- A = A.to(dtype=torch.float32)
885
- B = rearrange(B, "b l g n -> b g n l")
886
- C = rearrange(C, "b l g n -> b g n l")
887
- if D is not None:
888
- if D.dim() == 2:
889
- D = rearrange(D, "h p -> (h p)")
890
- else:
891
- D = repeat(D, "h -> (h p)", p=headdim)
892
- if z is not None:
893
- z = rearrange(z, "b l h p -> b (h p) l")
894
-
895
- if x.stride(-1) != 1:
896
- x = x.contiguous()
897
- if dt.stride(-1) != 1:
898
- dt = dt.contiguous()
899
- if D is not None:
900
- D = D.contiguous()
901
- if B.stride(-1) != 1:
902
- B = B.contiguous()
903
- if C.stride(-1) != 1:
904
- C = C.contiguous()
905
- if z is not None and z.stride(-1) != 1:
906
- z = z.contiguous()
907
- _, intermediate, *rest = selective_scan.fwd(
908
- x, dt.to(dtype=x.dtype), A, B, C, D, z, None, False
909
- )
910
- if z is not None:
911
- out = rest[0]
912
- else:
913
- out = None
914
-
915
- dout = rearrange(dout, "b l h p -> b (h p) l")
916
-
917
- if dout.stride(-1) != 1:
918
- dout = dout.contiguous()
919
- # The kernel supports passing in a pre-allocated dz (e.g., in case we want to fuse the
920
- # backward of selective_scan with the backward of chunk).
921
- # Here we just pass in None and dz will be allocated in the C++ code.
922
- _, ddt, dA, *rest = selective_scan.bwd(
923
- x,
924
- dt.to(dtype=x.dtype),
925
- A,
926
- B,
927
- C,
928
- D,
929
- z,
930
- None,
931
- dout,
932
- intermediate,
933
- out,
934
- None,
935
- False,
936
- False, # option to recompute out_z, not used here
937
- )
938
- ddt = rearrange(ddt, "b (h p) (c l) -> b h p c l", p=headdim, l=chunk_size)
939
- if squeeze_dt:
940
- ddt = ddt.float().sum(dim=2)
941
- if squeeze_A:
942
- dA = rearrange(dA, "(h p) n -> h p n", p=headdim).sum(dim=(1, 2))
943
- return ddt, dA
944
-
945
-
946
- class MambaChunkScanCombinedFn(torch.autograd.Function):
947
-
948
- @staticmethod
949
- def forward(
950
- ctx,
951
- x,
952
- dt,
953
- A,
954
- B,
955
- C,
956
- chunk_size,
957
- D=None,
958
- z=None,
959
- dt_bias=None,
960
- initial_states=None,
961
- seq_idx=None,
962
- cu_seqlens=None,
963
- dt_softplus=False,
964
- dt_limit=(0.0, float("inf")),
965
- return_final_states=False,
966
- return_varlen_states=False,
967
- ):
968
- ctx.dt_dtype = dt.dtype
969
- if not return_varlen_states:
970
- cu_seqlens = None
971
- else:
972
- assert (
973
- cu_seqlens is not None
974
- ), "cu_seqlens must be provided if return_varlen_states is True"
975
- out, out_x, dt_out, dA_cumsum, states, final_states, *rest = (
976
- _mamba_chunk_scan_combined_fwd(
977
- x,
978
- dt,
979
- A,
980
- B,
981
- C,
982
- chunk_size,
983
- D=D,
984
- z=z,
985
- dt_bias=dt_bias,
986
- initial_states=initial_states,
987
- seq_idx=seq_idx,
988
- cu_seqlens=cu_seqlens,
989
- dt_softplus=dt_softplus,
990
- dt_limit=dt_limit,
991
- )
992
- )
993
- ctx.save_for_backward(
994
- out if z is None else out_x,
995
- x,
996
- dt,
997
- dA_cumsum,
998
- A,
999
- B,
1000
- C,
1001
- D,
1002
- z,
1003
- dt_bias,
1004
- initial_states,
1005
- seq_idx,
1006
- )
1007
- ctx.dt_softplus = dt_softplus
1008
- ctx.chunk_size = chunk_size
1009
- ctx.dt_limit = dt_limit
1010
- ctx.return_final_states = return_final_states
1011
- ctx.return_varlen_states = return_varlen_states
1012
- if not return_varlen_states:
1013
- return out if not return_final_states else (out, final_states)
1014
- else:
1015
- varlen_states = rest[0]
1016
- return (
1017
- (out, varlen_states)
1018
- if not return_final_states
1019
- else (out, final_states, varlen_states)
1020
- )
1021
-
1022
- @staticmethod
1023
- def backward(ctx, dout, *args):
1024
- out, x, dt, dA_cumsum, A, B, C, D, z, dt_bias, initial_states, seq_idx = (
1025
- ctx.saved_tensors
1026
- )
1027
- assert (
1028
- not ctx.return_varlen_states
1029
- ), "return_varlen_states is not supported in backward"
1030
- dfinal_states = args[0] if ctx.return_final_states else None
1031
- dx, ddt, dA, dB, dC, dD, dz, ddt_bias, dinitial_states = (
1032
- _mamba_chunk_scan_combined_bwd(
1033
- dout,
1034
- x,
1035
- dt,
1036
- A,
1037
- B,
1038
- C,
1039
- out,
1040
- ctx.chunk_size,
1041
- D=D,
1042
- z=z,
1043
- dt_bias=dt_bias,
1044
- initial_states=initial_states,
1045
- dfinal_states=dfinal_states,
1046
- seq_idx=seq_idx,
1047
- dt_softplus=ctx.dt_softplus,
1048
- dt_limit=ctx.dt_limit,
1049
- )
1050
- )
1051
- return (
1052
- dx,
1053
- ddt,
1054
- dA,
1055
- dB,
1056
- dC,
1057
- None,
1058
- dD,
1059
- dz,
1060
- ddt_bias,
1061
- dinitial_states,
1062
- None,
1063
- None,
1064
- None,
1065
- None,
1066
- None,
1067
- None,
1068
- )
1069
-
1070
-
1071
- def mamba_chunk_scan_combined(
1072
- x,
1073
- dt,
1074
- A,
1075
- B,
1076
- C,
1077
- chunk_size,
1078
- D=None,
1079
- z=None,
1080
- dt_bias=None,
1081
- initial_states=None,
1082
- seq_idx=None,
1083
- cu_seqlens=None,
1084
- dt_softplus=False,
1085
- dt_limit=(0.0, float("inf")),
1086
- return_final_states=False,
1087
- return_varlen_states=False,
1088
- ):
1089
- """
1090
- Argument:
1091
- x: (batch, seqlen, nheads, headdim)
1092
- dt: (batch, seqlen, nheads)
1093
- A: (nheads)
1094
- B: (batch, seqlen, ngroups, dstate)
1095
- C: (batch, seqlen, ngroups, dstate)
1096
- chunk_size: int
1097
- D: (nheads, headdim) or (nheads,)
1098
- z: (batch, seqlen, nheads, headdim)
1099
- dt_bias: (nheads,)
1100
- initial_states: (batch, nheads, headdim, dstate)
1101
- seq_idx: (batch, seqlen)
1102
- cu_seqlens: (num_sequences + 1) or None, only used if return_varlen_states is True
1103
- dt_softplus: Whether to apply softplus to dt
1104
- Return:
1105
- out: (batch, seqlen, nheads, headdim)
1106
- """
1107
- return MambaChunkScanCombinedFn.apply(
1108
- x,
1109
- dt,
1110
- A,
1111
- B,
1112
- C,
1113
- chunk_size,
1114
- D,
1115
- z,
1116
- dt_bias,
1117
- initial_states,
1118
- seq_idx,
1119
- cu_seqlens,
1120
- dt_softplus,
1121
- dt_limit,
1122
- return_final_states,
1123
- return_varlen_states,
1124
- )
1125
-
1126
-
1127
- def mamba_chunk_scan(
1128
- x, dt, A, B, C, chunk_size, D=None, z=None, dt_bias=None, dt_softplus=False
1129
- ):
1130
- """
1131
- Argument:
1132
- x: (batch, seqlen, nheads, headdim)
1133
- dt: (batch, seqlen, nheads)
1134
- A: (nheads)
1135
- B: (batch, seqlen, ngroups, dstate)
1136
- C: (batch, seqlen, ngroups, dstate)
1137
- D: (nheads, headdim) or (nheads,)
1138
- z: (batch, seqlen, nheads, headdim)
1139
- dt_bias: (nheads,)
1140
- Return:
1141
- out: (batch, seqlen, nheads, headdim)
1142
- """
1143
- batch, seqlen, nheads, headdim = x.shape
1144
- dstate = B.shape[-1]
1145
- if seqlen % chunk_size != 0:
1146
- dt = F.pad(dt, (0, 0, 0, chunk_size - seqlen % chunk_size))
1147
- dt = rearrange(dt, "b (c l) h -> b h c l", l=chunk_size)
1148
- dt = dt.float() # We want high precision for this before cumsum
1149
- if dt_bias is not None:
1150
- dt = dt + rearrange(dt_bias, "h -> h 1 1")
1151
- if dt_softplus:
1152
- dt = F.softplus(dt)
1153
- dA = dt * rearrange(A, "h -> h 1 1")
1154
- dA = dt * rearrange(A, "h -> h 1 1")
1155
- dA_cumsum = torch.cumsum(dA, dim=-1)
1156
- # 1. Compute the state for each chunk
1157
- states = chunk_state(B, x, dt, dA_cumsum, states_in_fp32=True)
1158
- # 2. Pass the state to all the chunks by weighted cumsum.
1159
- states = rearrange(
1160
- state_passing(
1161
- rearrange(states, "... p n -> ... (p n)"), dA_cumsum[:, :, :, -1]
1162
- )[0],
1163
- "... (p n) -> ... p n",
1164
- n=dstate,
1165
- )
1166
- # 3. Compute the output for each chunk
1167
- out = chunk_scan(B, C, x, dt, dA_cumsum, states, D=D, z=z)
1168
- return out
1169
-
1170
-
1171
- def ssd_chunk_scan_combined_ref(
1172
- x, dt, A, B, C, chunk_size, D=None, z=None, dt_bias=None, dt_softplus=False
1173
- ):
1174
- """
1175
- Argument:
1176
- x: (batch, seqlen, nheads, headdim)
1177
- dt: (batch, seqlen, nheads)
1178
- A: (nheads)
1179
- B: (batch, seqlen, ngroups, dstate)
1180
- C: (batch, seqlen, ngroups, dstate)
1181
- D: (nheads, headdim) or (nheads,)
1182
- z: (batch, seqlen, nheads, headdim)
1183
- dt_bias: (nheads,)
1184
- Return:
1185
- out: (batch, seqlen, nheads, headdim)
1186
- """
1187
- batch, seqlen, nheads, headdim = x.shape
1188
- dstate = B.shape[-1]
1189
- if seqlen % chunk_size != 0:
1190
- dt = F.pad(dt, (0, 0, 0, chunk_size - seqlen % chunk_size))
1191
- dt = rearrange(dt, "b (c l) h -> b h c l", l=chunk_size)
1192
- dt = dt.float() # We want high precision for this before cumsum
1193
- if dt_bias is not None:
1194
- dt = dt + rearrange(dt_bias, "h -> h 1 1")
1195
- if dt_softplus:
1196
- dt = F.softplus(dt)
1197
- dA = dt * rearrange(A, "h -> h 1 1")
1198
- dA_cumsum = torch.cumsum(dA, dim=-1)
1199
- # 1. Compute the state for each chunk
1200
- states = chunk_state_ref(B, x, dt, dA_cumsum)
1201
- states_dtype = states.dtype
1202
- if states.dtype not in [torch.float32, torch.float64]:
1203
- states = states.to(torch.float32)
1204
- # 2. Pass the state to all the chunks by weighted cumsum.
1205
- # state_passing_ref is much less numerically stable
1206
- states = rearrange(
1207
- state_passing_ref(
1208
- rearrange(states, "... p n -> ... (p n)"), dA_cumsum[:, :, :, -1]
1209
- )[0],
1210
- "... (p n) -> ... p n",
1211
- n=dstate,
1212
- )
1213
- states = states.to(states_dtype)
1214
- # 3. Compute the output for each chunk
1215
- out = chunk_scan_ref(B, C, x, dt, dA_cumsum, states, D=D, z=z)
1216
- return out
1217
-
1218
-
1219
- def ssd_selective_scan(
1220
- x,
1221
- dt,
1222
- A,
1223
- B,
1224
- C,
1225
- D=None,
1226
- z=None,
1227
- dt_bias=None,
1228
- dt_softplus=False,
1229
- dt_limit=(0.0, float("inf")),
1230
- ):
1231
- """
1232
- Argument:
1233
- x: (batch, seqlen, nheads, headdim)
1234
- dt: (batch, seqlen, nheads) or (batch, seqlen, nheads, headdim)
1235
- A: (nheads) or (dim, dstate)
1236
- B: (batch, seqlen, ngroups, dstate)
1237
- C: (batch, seqlen, ngroups, dstate)
1238
- D: (nheads, headdim) or (nheads,)
1239
- z: (batch, seqlen, nheads, headdim)
1240
- dt_bias: (nheads,) or (nheads, headdim)
1241
- Return:
1242
- out: (batch, seqlen, nheads, headdim)
1243
- """
1244
- from ..selective_scan_interface import selective_scan_fn
1245
-
1246
- batch, seqlen, nheads, headdim = x.shape
1247
- _, _, ngroups, dstate = B.shape
1248
- x = rearrange(x, "b l h p -> b (h p) l")
1249
- if dt.dim() == 3:
1250
- dt = repeat(dt, "b l h -> b l h p", p=headdim)
1251
- dt = rearrange(dt, "b l h p -> b (h p) l")
1252
- if A.dim() == 1:
1253
- A = repeat(A, "h -> (h p) n", p=headdim, n=dstate).to(dtype=torch.float32)
1254
- else:
1255
- A = A.to(dtype=torch.float32)
1256
- B = rearrange(B, "b l g n -> b g n l")
1257
- C = rearrange(C, "b l g n -> b g n l")
1258
- if D is not None:
1259
- if D.dim() == 2:
1260
- D = rearrange(D, "h p -> (h p)")
1261
- else:
1262
- D = repeat(D, "h -> (h p)", p=headdim)
1263
- if z is not None:
1264
- z = rearrange(z, "b l h p -> b (h p) l")
1265
- if dt_bias is not None:
1266
- if dt_bias.dim() == 1:
1267
- dt_bias = repeat(dt_bias, "h -> h p", p=headdim)
1268
- dt_bias = rearrange(dt_bias, "h p -> (h p)")
1269
- if dt_limit != (0.0, float("inf")):
1270
- if dt_bias is not None:
1271
- dt = dt + rearrange(dt_bias, "d -> d 1")
1272
- if dt_softplus:
1273
- dt = F.softplus(dt)
1274
- dt = dt.clamp(min=dt_limit[0], max=dt_limit[1]).to(x.dtype)
1275
- dt_bias = None
1276
- dt_softplus = None
1277
- out = selective_scan_fn(
1278
- x, dt, A, B, C, D=D, z=z, delta_bias=dt_bias, delta_softplus=dt_softplus
1279
- )
1280
- return rearrange(out, "b (h p) l -> b l h p", p=headdim)
1281
-
1282
-
1283
- def mamba_conv1d_scan_ref(
1284
- xBC,
1285
- conv1d_weight,
1286
- conv1d_bias,
1287
- dt,
1288
- A,
1289
- chunk_size,
1290
- D=None,
1291
- z=None,
1292
- dt_bias=None,
1293
- dt_softplus=False,
1294
- dt_limit=(0.0, float("inf")),
1295
- activation="silu",
1296
- headdim=None,
1297
- ngroups=1,
1298
- ):
1299
- """
1300
- Argument:
1301
- xBC: (batch, seqlen, dim + 2 * ngroups * dstate) where dim == nheads * headdim
1302
- conv1d_weight: (dim + 2 * ngroups * dstate, width)
1303
- conv1d_bias: (dim + 2 * ngroups * dstate,)
1304
- dt: (batch, seqlen, nheads) or (batch, seqlen, nheads, headdim)
1305
- A: (nheads)
1306
- D: (nheads, headdim) or (nheads,)
1307
- z: (batch, seqlen, dim)
1308
- dt_bias: (nheads) or (nheads, headdim)
1309
- headdim: if D is 1D and z is None, headdim must be passed in
1310
- Return:
1311
- out: (batch, seqlen, dim)
1312
- """
1313
- batch, seqlen, nheads = dt.shape[:3]
1314
- assert nheads % ngroups == 0
1315
- if z is not None:
1316
- dim = z.shape[-1]
1317
- assert dim % nheads == 0
1318
- headdim = dim // nheads
1319
- else:
1320
- if D.dim() == 1:
1321
- assert headdim is not None
1322
- else:
1323
- headdim = D.shape[1]
1324
- dim = nheads * headdim
1325
- xBC = rearrange(
1326
- causal_conv1d_fn(
1327
- rearrange(xBC, "b s d -> b d s"),
1328
- conv1d_weight,
1329
- conv1d_bias,
1330
- activation=activation,
1331
- ),
1332
- "b d s -> b s d",
1333
- )
1334
- dstate = (xBC.shape[-1] - dim) // ngroups // 2
1335
- x, B, C = torch.split(xBC, [dim, ngroups * dstate, ngroups * dstate], dim=-1)
1336
- x = rearrange(x, "b l (h p) -> b l h p", h=nheads)
1337
- B = rearrange(B, "b l (g n) -> b l g n", g=ngroups)
1338
- C = rearrange(C, "b l (g n) -> b l g n", g=ngroups)
1339
- z = rearrange(z, "b l (h p) -> b l h p", h=nheads) if z is not None else None
1340
- out = ssd_selective_scan(
1341
- x,
1342
- dt.to(x.dtype),
1343
- A,
1344
- B,
1345
- C,
1346
- D=D.float(),
1347
- z=z,
1348
- dt_bias=dt_bias,
1349
- dt_softplus=dt_softplus,
1350
- dt_limit=dt_limit,
1351
- )
1352
- return rearrange(out, "b s h p -> b s (h p)")
1353
-
1354
-
1355
- class MambaSplitConv1dScanCombinedFn(torch.autograd.Function):
1356
-
1357
- @staticmethod
1358
- @custom_fwd
1359
- def forward(
1360
- ctx,
1361
- zxbcdt,
1362
- conv1d_weight,
1363
- conv1d_bias,
1364
- dt_bias,
1365
- A,
1366
- D,
1367
- chunk_size,
1368
- initial_states=None,
1369
- seq_idx=None,
1370
- dt_limit=(0.0, float("inf")),
1371
- return_final_states=False,
1372
- activation="silu",
1373
- rmsnorm_weight=None,
1374
- rmsnorm_eps=1e-6,
1375
- outproj_weight=None,
1376
- outproj_bias=None,
1377
- headdim=None,
1378
- ngroups=1,
1379
- norm_before_gate=True,
1380
- ):
1381
- assert activation in [None, "silu", "swish"]
1382
- if D.dim() == 1:
1383
- assert headdim is not None
1384
- (nheads,) = D.shape
1385
- else:
1386
- nheads, headdim = D.shape
1387
- batch, seqlen, _ = zxbcdt.shape
1388
- dim = nheads * headdim
1389
- assert nheads % ngroups == 0
1390
- dstate = (conv1d_weight.shape[0] - dim) // ngroups // 2
1391
- d_nonssm = (zxbcdt.shape[-1] - 2 * dim - 2 * ngroups * dstate - nheads) // 2
1392
- assert d_nonssm >= 0
1393
- assert zxbcdt.shape == (
1394
- batch,
1395
- seqlen,
1396
- 2 * d_nonssm + 2 * dim + 2 * ngroups * dstate + nheads,
1397
- )
1398
- assert dt_bias.shape == (nheads,)
1399
- assert A.shape == (nheads,)
1400
- zx0, z, xBC, dt = torch.split(
1401
- zxbcdt, [2 * d_nonssm, dim, dim + ngroups * dstate * 2, nheads], dim=-1
1402
- )
1403
- seq_idx = seq_idx.contiguous() if seq_idx is not None else None
1404
- xBC_conv = rearrange(
1405
- causal_conv1d_cuda.causal_conv1d_fwd(
1406
- rearrange(xBC, "b s d -> b d s"),
1407
- conv1d_weight,
1408
- conv1d_bias,
1409
- seq_idx,
1410
- None,
1411
- None,
1412
- activation in ["silu", "swish"],
1413
- ),
1414
- "b d s -> b s d",
1415
- )
1416
- x, B, C = torch.split(
1417
- xBC_conv, [dim, ngroups * dstate, ngroups * dstate], dim=-1
1418
- )
1419
- x = rearrange(x, "b l (h p) -> b l h p", h=nheads)
1420
- B = rearrange(B, "b l (g n) -> b l g n", g=ngroups)
1421
- C = rearrange(C, "b l (g n) -> b l g n", g=ngroups)
1422
- z = rearrange(z, "b l (h p) -> b l h p", h=nheads) if z is not None else None
1423
- if rmsnorm_weight is None:
1424
- out, out_x, dt_out, dA_cumsum, states, final_states = (
1425
- _mamba_chunk_scan_combined_fwd(
1426
- x,
1427
- dt,
1428
- A,
1429
- B,
1430
- C,
1431
- chunk_size=chunk_size,
1432
- D=D,
1433
- z=z,
1434
- dt_bias=dt_bias,
1435
- initial_states=initial_states,
1436
- seq_idx=seq_idx,
1437
- dt_softplus=True,
1438
- dt_limit=dt_limit,
1439
- )
1440
- )
1441
- out = rearrange(out, "b s h p -> b s (h p)")
1442
- rstd = None
1443
- if d_nonssm > 0:
1444
- out = torch.cat([_swiglu_fwd(zx0), out], dim=-1)
1445
- else:
1446
- out_x, _, dt_out, dA_cumsum, states, final_states = (
1447
- _mamba_chunk_scan_combined_fwd(
1448
- x,
1449
- dt,
1450
- A,
1451
- B,
1452
- C,
1453
- chunk_size=chunk_size,
1454
- D=D,
1455
- z=None,
1456
- dt_bias=dt_bias,
1457
- initial_states=initial_states,
1458
- seq_idx=seq_idx,
1459
- dt_softplus=True,
1460
- dt_limit=dt_limit,
1461
- )
1462
- )
1463
- # reshape input data into 2D tensor
1464
- x_rms = rearrange(out_x, "b s h p -> (b s) (h p)")
1465
- z_rms = rearrange(z, "b s h p -> (b s) (h p)")
1466
- rmsnorm_weight = rmsnorm_weight.contiguous()
1467
- if d_nonssm == 0:
1468
- out = None
1469
- else:
1470
- out01 = torch.empty(
1471
- (batch, seqlen, d_nonssm + dim),
1472
- dtype=x_rms.dtype,
1473
- device=x_rms.device,
1474
- )
1475
- out = rearrange(out01[..., d_nonssm:], "b s d -> (b s) d")
1476
- _swiglu_fwd(zx0, out=out01[..., :d_nonssm])
1477
- out, _, rstd = _layer_norm_fwd(
1478
- x_rms,
1479
- rmsnorm_weight,
1480
- None,
1481
- rmsnorm_eps,
1482
- z_rms,
1483
- out=out,
1484
- group_size=dim // ngroups,
1485
- norm_before_gate=norm_before_gate,
1486
- is_rms_norm=True,
1487
- )
1488
- if d_nonssm == 0:
1489
- out = rearrange(out, "(b s) d -> b s d", b=batch)
1490
- else:
1491
- out = out01
1492
- ctx.outproj_weight_dtype = (
1493
- outproj_weight.dtype if outproj_weight is not None else None
1494
- )
1495
- if outproj_weight is not None:
1496
- if torch.is_autocast_enabled():
1497
- dtype = torch.get_autocast_gpu_dtype()
1498
- out, outproj_weight = out.to(dtype), outproj_weight.to(dtype)
1499
- outproj_bias = (
1500
- outproj_bias.to(dtype) if outproj_bias is not None else None
1501
- )
1502
- out = F.linear(out, outproj_weight, outproj_bias)
1503
- else:
1504
- assert outproj_bias is None
1505
- ctx.save_for_backward(
1506
- zxbcdt,
1507
- conv1d_weight,
1508
- conv1d_bias,
1509
- out_x,
1510
- A,
1511
- D,
1512
- dt_bias,
1513
- initial_states,
1514
- seq_idx,
1515
- rmsnorm_weight,
1516
- rstd,
1517
- outproj_weight,
1518
- outproj_bias,
1519
- )
1520
- ctx.dt_limit = dt_limit
1521
- ctx.return_final_states = return_final_states
1522
- ctx.activation = activation
1523
- ctx.rmsnorm_eps = rmsnorm_eps
1524
- ctx.norm_before_gate = norm_before_gate
1525
- ctx.chunk_size = chunk_size
1526
- ctx.headdim = headdim
1527
- ctx.ngroups = ngroups
1528
- return out if not return_final_states else (out, final_states)
1529
-
1530
- @staticmethod
1531
- @custom_bwd
1532
- def backward(ctx, dout, *args):
1533
- (
1534
- zxbcdt,
1535
- conv1d_weight,
1536
- conv1d_bias,
1537
- out,
1538
- A,
1539
- D,
1540
- dt_bias,
1541
- initial_states,
1542
- seq_idx,
1543
- rmsnorm_weight,
1544
- rstd,
1545
- outproj_weight,
1546
- outproj_bias,
1547
- ) = ctx.saved_tensors
1548
- dfinal_states = args[0] if ctx.return_final_states else None
1549
- headdim = ctx.headdim
1550
- nheads = D.shape[0]
1551
- dim = nheads * headdim
1552
- assert nheads % ctx.ngroups == 0
1553
- dstate = (conv1d_weight.shape[0] - dim) // ctx.ngroups // 2
1554
- d_nonssm = (zxbcdt.shape[-1] - 2 * dim - 2 * ctx.ngroups * dstate - nheads) // 2
1555
- assert d_nonssm >= 0
1556
- recompute_output = outproj_weight is not None
1557
- if recompute_output:
1558
- out_recompute = torch.empty(
1559
- *out.shape[:2], d_nonssm + dim, device=out.device, dtype=out.dtype
1560
- )
1561
- out0_recompute, out1_recompute = out_recompute.split(
1562
- [d_nonssm, dim], dim=-1
1563
- )
1564
- zx0, z, xBC, dt = torch.split(
1565
- zxbcdt, [2 * d_nonssm, dim, dim + 2 * ctx.ngroups * dstate, nheads], dim=-1
1566
- )
1567
- # Recompute x, B, C
1568
- xBC_conv = rearrange(
1569
- causal_conv1d_cuda.causal_conv1d_fwd(
1570
- rearrange(xBC, "b s d -> b d s"),
1571
- conv1d_weight,
1572
- conv1d_bias,
1573
- seq_idx,
1574
- None,
1575
- None,
1576
- ctx.activation in ["silu", "swish"],
1577
- ),
1578
- "b d s -> b s d",
1579
- )
1580
- x, B, C = torch.split(
1581
- xBC_conv, [dim, ctx.ngroups * dstate, ctx.ngroups * dstate], dim=-1
1582
- )
1583
- x = rearrange(x, "b l (h p) -> b l h p", h=nheads)
1584
- B = rearrange(B, "b l (g n) -> b l g n", g=ctx.ngroups)
1585
- C = rearrange(C, "b l (g n) -> b l g n", g=ctx.ngroups)
1586
- dzxbcdt = torch.empty_like(zxbcdt)
1587
- dzx0, dz, dxBC_given, ddt_given = torch.split(
1588
- dzxbcdt, [2 * d_nonssm, dim, dim + 2 * ctx.ngroups * dstate, nheads], dim=-1
1589
- )
1590
- dxBC = torch.empty_like(xBC)
1591
- dx, dB, dC = torch.split(
1592
- dxBC, [dim, ctx.ngroups * dstate, ctx.ngroups * dstate], dim=-1
1593
- )
1594
- z = rearrange(z, "b l (h p) -> b l h p", h=nheads)
1595
- dx = rearrange(dx, "b l (h p) -> b l h p", h=nheads)
1596
- dB = rearrange(dB, "b l (g n) -> b l g n", g=ctx.ngroups)
1597
- dC = rearrange(dC, "b l (g n) -> b l g n", g=ctx.ngroups)
1598
- if outproj_weight is not None:
1599
- dout_og = dout
1600
- dout = F.linear(dout, outproj_weight.t())
1601
- if d_nonssm > 0:
1602
- dout0, dout = dout.split([d_nonssm, dim], dim=-1)
1603
- _swiglu_bwd(zx0, dout0, dxy=dzx0, recompute_output=True, out=out0_recompute)
1604
- dout = rearrange(dout, "b s (h p) -> b s h p", p=headdim)
1605
- if rmsnorm_weight is None:
1606
- dz = rearrange(dz, "b l (h p) -> b l h p", h=nheads)
1607
- dx, ddt, dA, dB, dC, dD, dz, ddt_bias, dinitial_states, *rest = (
1608
- _mamba_chunk_scan_combined_bwd(
1609
- dout,
1610
- x,
1611
- dt,
1612
- A,
1613
- B,
1614
- C,
1615
- out,
1616
- ctx.chunk_size,
1617
- D=D,
1618
- z=z,
1619
- dt_bias=dt_bias,
1620
- initial_states=initial_states,
1621
- dfinal_states=dfinal_states,
1622
- seq_idx=seq_idx,
1623
- dt_softplus=True,
1624
- dt_limit=ctx.dt_limit,
1625
- dx=dx,
1626
- ddt=ddt_given,
1627
- dB=dB,
1628
- dC=dC,
1629
- dz=dz,
1630
- recompute_output=recompute_output,
1631
- )
1632
- )
1633
- out_for_linear = (
1634
- rearrange(rest[0], "b s h p -> b s (h p)") if recompute_output else None
1635
- )
1636
- drmsnorm_weight = None
1637
- else:
1638
- batch = dout.shape[0]
1639
- dy_rms = rearrange(dout, "b s h p -> (b s) (h p)")
1640
- dz = rearrange(dz, "b l d -> (b l) d")
1641
- x_rms = rearrange(out, "b s h p -> (b s) (h p)")
1642
- z_rms = rearrange(z, "b s h p -> (b s) (h p)")
1643
- out1_recompute = (
1644
- rearrange(out1_recompute, "b s d -> (b s) d")
1645
- if recompute_output
1646
- else None
1647
- )
1648
- dout, drmsnorm_weight, _, dz, *rest = _layer_norm_bwd(
1649
- dy_rms,
1650
- x_rms,
1651
- rmsnorm_weight,
1652
- None,
1653
- ctx.rmsnorm_eps,
1654
- None,
1655
- rstd,
1656
- z_rms,
1657
- group_size=dim // ctx.ngroups,
1658
- norm_before_gate=ctx.norm_before_gate,
1659
- is_rms_norm=True,
1660
- recompute_output=recompute_output,
1661
- dz=dz,
1662
- out=out1_recompute if recompute_output else None,
1663
- )
1664
- out_for_linear = out_recompute if recompute_output else None
1665
- dout = rearrange(dout, "(b s) (h p) -> b s h p", b=batch, p=headdim)
1666
- dx, ddt, dA, dB, dC, dD, _, ddt_bias, dinitial_states = (
1667
- _mamba_chunk_scan_combined_bwd(
1668
- dout,
1669
- x,
1670
- dt,
1671
- A,
1672
- B,
1673
- C,
1674
- out,
1675
- ctx.chunk_size,
1676
- D=D,
1677
- z=None,
1678
- dt_bias=dt_bias,
1679
- initial_states=initial_states,
1680
- dfinal_states=dfinal_states,
1681
- seq_idx=seq_idx,
1682
- dt_softplus=True,
1683
- dt_limit=ctx.dt_limit,
1684
- dx=dx,
1685
- ddt=ddt_given,
1686
- dB=dB,
1687
- dC=dC,
1688
- )
1689
- )
1690
-
1691
- if outproj_weight is not None:
1692
- doutproj_weight = torch.einsum("bso,bsd->od", dout_og, out_for_linear)
1693
- doutproj_bias = (
1694
- dout_og.sum(dim=(0, 1)) if outproj_bias is not None else None
1695
- )
1696
- else:
1697
- doutproj_weight, doutproj_bias = None, None
1698
- dxBC_given = rearrange(dxBC_given, "b s d -> b d s")
1699
- dxBC_given, dweight, dbias, *_ = causal_conv1d_cuda.causal_conv1d_bwd(
1700
- rearrange(xBC, "b s d -> b d s"),
1701
- conv1d_weight,
1702
- conv1d_bias,
1703
- rearrange(dxBC, "b s d -> b d s"),
1704
- seq_idx,
1705
- None,
1706
- None,
1707
- dxBC_given,
1708
- False,
1709
- ctx.activation in ["silu", "swish"],
1710
- )
1711
- dxBC_given = rearrange(dxBC_given, "b d s -> b s d")
1712
- return (
1713
- dzxbcdt,
1714
- dweight,
1715
- dbias,
1716
- ddt_bias,
1717
- dA,
1718
- dD,
1719
- None,
1720
- dinitial_states,
1721
- None,
1722
- None,
1723
- None,
1724
- None,
1725
- drmsnorm_weight,
1726
- None,
1727
- doutproj_weight,
1728
- doutproj_bias,
1729
- None,
1730
- None,
1731
- None,
1732
- )
1733
-
1734
-
1735
- def mamba_split_conv1d_scan_combined(
1736
- zxbcdt,
1737
- conv1d_weight,
1738
- conv1d_bias,
1739
- dt_bias,
1740
- A,
1741
- D,
1742
- chunk_size,
1743
- initial_states=None,
1744
- seq_idx=None,
1745
- dt_limit=(0.0, float("inf")),
1746
- return_final_states=False,
1747
- activation="silu",
1748
- rmsnorm_weight=None,
1749
- rmsnorm_eps=1e-6,
1750
- outproj_weight=None,
1751
- outproj_bias=None,
1752
- headdim=None,
1753
- ngroups=1,
1754
- norm_before_gate=True,
1755
- ):
1756
- """
1757
- Argument:
1758
- zxbcdt: (batch, seqlen, 2 * dim + 2 * ngroups * dstate + nheads) where dim == nheads * headdim
1759
- conv1d_weight: (dim + 2 * ngroups * dstate, width)
1760
- conv1d_bias: (dim + 2 * ngroups * dstate,)
1761
- dt_bias: (nheads,)
1762
- A: (nheads)
1763
- D: (nheads, headdim) or (nheads,)
1764
- initial_states: (batch, nheads, headdim, dstate)
1765
- seq_idx: (batch, seqlen), int32
1766
- rmsnorm_weight: (dim,)
1767
- outproj_weight: (out_dim, dim)
1768
- outproj_bias: (out_dim,)
1769
- headdim: if D is 1D, headdim must be passed in
1770
- norm_before_gate: if True, we do RMSNorm(x) * F.silu(z). If False, we do RMSNorm(x * F.silu(z))
1771
- Return:
1772
- out: (batch, seqlen, dim)
1773
- """
1774
- return MambaSplitConv1dScanCombinedFn.apply(
1775
- zxbcdt,
1776
- conv1d_weight,
1777
- conv1d_bias,
1778
- dt_bias,
1779
- A,
1780
- D,
1781
- chunk_size,
1782
- initial_states,
1783
- seq_idx,
1784
- dt_limit,
1785
- return_final_states,
1786
- activation,
1787
- rmsnorm_weight,
1788
- rmsnorm_eps,
1789
- outproj_weight,
1790
- outproj_bias,
1791
- headdim,
1792
- ngroups,
1793
- norm_before_gate,
1794
- )
1795
-
1796
-
1797
- def mamba_split_conv1d_scan_ref(
1798
- zxbcdt,
1799
- conv1d_weight,
1800
- conv1d_bias,
1801
- dt_bias,
1802
- A,
1803
- D,
1804
- chunk_size,
1805
- dt_limit=(0.0, float("inf")),
1806
- activation="silu",
1807
- rmsnorm_weight=None,
1808
- rmsnorm_eps=1e-6,
1809
- outproj_weight=None,
1810
- outproj_bias=None,
1811
- headdim=None,
1812
- ngroups=1,
1813
- norm_before_gate=True,
1814
- ):
1815
- """
1816
- Argument:
1817
- zxbcdt: (batch, seqlen, 2 * dim + 2 * ngroups * dstate + nheads) where dim == nheads * headdim
1818
- conv1d_weight: (dim + 2 * ngroups * dstate, width)
1819
- conv1d_bias: (dim + 2 * ngroups * dstate,)
1820
- dt_bias: (nheads,)
1821
- A: (nheads)
1822
- D: (nheads, headdim) or (nheads,)
1823
- rmsnorm_weight: (dim,)
1824
- outproj_weight: (out_dim, dim)
1825
- outproj_bias: (out_dim,)
1826
- headdim: if D is 1D, headdim must be passed in
1827
- norm_before_gate: if True, we do RMSNorm(x) * F.silu(z). If False, we do RMSNorm(x * F.silu(z))
1828
- Return:
1829
- out: (batch, seqlen, dim)
1830
- """
1831
- if D.dim() == 1:
1832
- assert headdim is not None
1833
- (nheads,) = D.shape
1834
- else:
1835
- nheads, headdim = D.shape
1836
- assert nheads % ngroups == 0
1837
- batch, seqlen, _ = zxbcdt.shape
1838
- dim = nheads * headdim
1839
- dstate = (zxbcdt.shape[-1] - 2 * dim - nheads) // ngroups // 2
1840
- assert zxbcdt.shape == (batch, seqlen, 2 * dim + 2 * ngroups * dstate + nheads)
1841
- assert dt_bias.shape == (nheads,)
1842
- assert A.shape == (nheads,)
1843
- if rmsnorm_weight is not None:
1844
- assert rmsnorm_weight.shape == (dim,)
1845
- z, xBC, dt = torch.split(zxbcdt, [dim, dim + 2 * ngroups * dstate, nheads], dim=-1)
1846
- xBC = rearrange(
1847
- causal_conv1d_fn(
1848
- rearrange(xBC, "b s d -> b d s"),
1849
- conv1d_weight,
1850
- conv1d_bias,
1851
- activation=activation,
1852
- ),
1853
- "b d s -> b s d",
1854
- )
1855
- x, B, C = torch.split(xBC, [dim, ngroups * dstate, ngroups * dstate], dim=-1)
1856
- x = rearrange(x, "b l (h p) -> b l h p", h=nheads)
1857
- B = rearrange(B, "b l (g n) -> b l g n", g=ngroups)
1858
- C = rearrange(C, "b l (g n) -> b l g n", g=ngroups)
1859
- z = rearrange(z, "b l (h p) -> b l h p", h=nheads)
1860
- out = ssd_selective_scan(
1861
- x,
1862
- dt.to(x.dtype),
1863
- A,
1864
- B,
1865
- C,
1866
- D=D.float(),
1867
- z=z if rmsnorm_weight is None else None,
1868
- dt_bias=dt_bias,
1869
- dt_softplus=True,
1870
- dt_limit=dt_limit,
1871
- )
1872
- out = rearrange(out, "b s h p -> b s (h p)")
1873
- if rmsnorm_weight is not None:
1874
- out = rmsnorm_fn(
1875
- out,
1876
- rmsnorm_weight,
1877
- None,
1878
- z=rearrange(z, "b l h p -> b l (h p)"),
1879
- eps=rmsnorm_eps,
1880
- norm_before_gate=norm_before_gate,
1881
- )
1882
- if outproj_weight is not None:
1883
- out = F.linear(out, outproj_weight, outproj_bias)
1884
- return out
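For reference, the fused entry point deleted above can be exercised with tensors shaped exactly as its docstring describes. The sketch below is illustrative only: it assumes a CUDA device, the compiled mamba_ssm Triton/CUDA kernels plus causal-conv1d, and that the function is importable from `mamba_ssm.ops.triton.ssd_combined` as in upstream mamba-ssm; all sizes are arbitrary.

```python
import torch
from mamba_ssm.ops.triton.ssd_combined import mamba_split_conv1d_scan_combined

# Arbitrary sizes; dim == nheads * headdim and the conv acts on dim + 2*ngroups*dstate channels.
batch, seqlen, nheads, headdim, ngroups, dstate, width = 2, 128, 8, 64, 1, 16, 4
dim = nheads * headdim
conv_dim = dim + 2 * ngroups * dstate
dev = "cuda"

zxbcdt = torch.randn(batch, seqlen, 2 * dim + 2 * ngroups * dstate + nheads, device=dev)
conv1d_weight = torch.randn(conv_dim, width, device=dev)
conv1d_bias = torch.randn(conv_dim, device=dev)
dt_bias = torch.randn(nheads, device=dev)
A = -torch.rand(nheads, device=dev)   # negative real A, as used by Mamba2
D = torch.randn(nheads, device=dev)   # 1D D, so headdim must be passed explicitly

out = mamba_split_conv1d_scan_combined(
    zxbcdt, conv1d_weight, conv1d_bias, dt_bias, A, D,
    chunk_size=64, activation="silu", headdim=headdim, ngroups=ngroups,
)
print(out.shape)  # (batch, seqlen, dim) when outproj_weight is None
```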
build/torch25-cxx11-cu124-x86_64-linux/mamba_ssm/utils/__init__.py DELETED
File without changes
build/torch25-cxx98-cu118-x86_64-linux/mamba_ssm/__init__.py DELETED
@@ -1,14 +0,0 @@
- __version__ = "2.2.4"
-
- from .ops.selective_scan_interface import selective_scan_fn, mamba_inner_fn
- from .modules.mamba_simple import Mamba
- from .modules.mamba2 import Mamba2
- from .models.mixer_seq_simple import MambaLMHeadModel
-
- __all__ = [
-     "selective_scan_fn",
-     "mamba_inner_fn",
-     "Mamba",
-     "Mamba2",
-     "MambaLMHeadModel",
- ]
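These re-exports make the main building blocks importable from the package root. A minimal sketch, assuming the package from this repo is installed together with its CUDA/Triton kernels and a GPU is available; the `Mamba` constructor defaults (d_state, d_conv, expand) are taken from upstream mamba-ssm.

```python
import torch
from mamba_ssm import Mamba

layer = Mamba(d_model=256).to("cuda")        # d_state/d_conv/expand keep their defaults
x = torch.randn(2, 64, 256, device="cuda")   # (batch, seqlen, d_model)
y = layer(x)
print(y.shape)                               # same shape as the input
```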
build/torch25-cxx98-cu118-x86_64-linux/mamba_ssm/distributed/__init__.py DELETED
File without changes
build/torch25-cxx98-cu118-x86_64-linux/mamba_ssm/distributed/tensor_parallel.py DELETED
@@ -1,326 +0,0 @@
1
- # Copyright (c) 2024, Tri Dao.
2
- # The TensorParallel linear modules are inspired by https://github.com/NVIDIA/apex/blob/master/apex/transformer/tensor_parallel/layers.py
3
- from typing import Optional
4
-
5
- import torch
6
- import torch.nn as nn
7
- import torch.nn.functional as F
8
- from torch import Tensor
9
- from torch.distributed import ProcessGroup
10
- from ..utils.torch import custom_bwd, custom_fwd
11
-
12
- from einops import rearrange
13
-
14
- from ..distributed.distributed_utils import (
15
- all_gather_raw,
16
- all_reduce,
17
- all_reduce_raw,
18
- reduce_scatter,
19
- reduce_scatter_raw,
20
- )
21
-
22
-
23
- class ParallelLinearFunc(torch.autograd.Function):
24
- @staticmethod
25
- @custom_fwd
26
- def forward(ctx, x, weight, bias, process_group=None, sequence_parallel=True):
27
- """
28
- If process_group is not None and sequence_parallel=True, we're doing Tensor Parallel
29
- with sequence parallelism: we do an all_gather_raw of x before doing the matmul.
30
- """
31
- ctx.compute_weight_gradient = weight.requires_grad
32
- ctx.process_group = process_group
33
- ctx.sequence_parallel = sequence_parallel
34
-
35
- if torch.is_autocast_enabled():
36
- x = x.to(dtype=torch.get_autocast_gpu_dtype())
37
- x = x.contiguous()
38
- if process_group is not None and sequence_parallel:
39
- # We want to kick off the all_gather early, before weight dtype conversion
40
- total_x, handle_x = all_gather_raw(x, process_group, async_op=True)
41
- else:
42
- total_x = x
43
-
44
- if torch.is_autocast_enabled():
45
- weight = weight.to(dtype=torch.get_autocast_gpu_dtype())
46
- bias = (
47
- bias.to(dtype=torch.get_autocast_gpu_dtype())
48
- if bias is not None
49
- else None
50
- )
51
- weight = weight.contiguous()
52
- if process_group is not None and sequence_parallel:
53
- handle_x.wait()
54
- batch_shape, n = total_x.shape[:-1], total_x.shape[-1]
55
- batch_dim = batch_shape.numel()
56
- # https://github.com/pytorch/pytorch/blob/5b51849b48a7dbccd297286cc0110def4706f9e7/aten/src/ATen/native/cuda/Blas.cpp#L174
57
- output = F.linear(total_x, weight, bias)
58
- if ctx.compute_weight_gradient:
59
- ctx.save_for_backward(x, weight)
60
- else:
61
- ctx.save_for_backward(weight)
62
- return output
63
-
64
- @staticmethod
65
- @custom_bwd
66
- def backward(ctx, grad_output):
67
- grad_output = grad_output.contiguous()
68
- process_group = ctx.process_group
69
- sequence_parallel = ctx.sequence_parallel
70
- if ctx.compute_weight_gradient:
71
- x, weight = ctx.saved_tensors
72
- if process_group is not None and sequence_parallel:
73
- total_x, handle_x = all_gather_raw(x, process_group, async_op=True)
74
- else:
75
- total_x = x
76
- else:
77
- (weight,) = ctx.saved_tensors
78
- total_x = None
79
- batch_shape = grad_output.shape[:-1]
80
- batch_dim = batch_shape.numel()
81
- grad_output = grad_output.reshape(batch_dim, grad_output.shape[-1])
82
- if ctx.needs_input_grad[0]:
83
- grad_input = F.linear(grad_output, weight.t())
84
- grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1])
85
- if process_group is not None:
86
- reduce_fn = reduce_scatter_raw if sequence_parallel else all_reduce_raw
87
- grad_input, handle_grad_input = reduce_fn(
88
- grad_input, process_group, async_op=True
89
- )
90
- else:
91
- grad_input = None
92
- if ctx.needs_input_grad[1]:
93
- assert ctx.compute_weight_gradient
94
- if process_group is not None and sequence_parallel:
95
- handle_x.wait()
96
- grad_weight = torch.einsum(
97
- "bo,bi->oi", grad_output, total_x.reshape(batch_dim, total_x.shape[-1])
98
- )
99
- else:
100
- grad_weight = None
101
- grad_bias = grad_output.sum(dim=0) if ctx.needs_input_grad[2] else None
102
- if process_group is not None and ctx.needs_input_grad[0]:
103
- handle_grad_input.wait()
104
- return grad_input, grad_weight, grad_bias, None, None
105
-
106
-
107
- def parallel_linear_func(
108
- x: Tensor,
109
- weight: Tensor,
110
- bias: Optional[Tensor] = None,
111
- process_group: Optional[ProcessGroup] = None,
112
- sequence_parallel: bool = True,
113
- ):
114
- return ParallelLinearFunc.apply(x, weight, bias, process_group, sequence_parallel)
115
-
116
-
117
- class ColumnParallelLinear(nn.Linear):
118
- def __init__(
119
- self,
120
- in_features: int,
121
- out_features: int,
122
- process_group: ProcessGroup,
123
- bias: bool = True,
124
- sequence_parallel=True,
125
- multiple_of=1,
126
- device=None,
127
- dtype=None,
128
- ) -> None:
129
- world_size = torch.distributed.get_world_size(process_group)
130
- if out_features % multiple_of:
131
- raise ValueError(
132
- f"out_features ({out_features}) must be a multiple of {multiple_of}"
133
- )
134
- multiple = out_features // multiple_of
135
- # We want to split @multiple across world_size, but it could be an uneven split
136
- div = multiple // world_size
137
- mod = multiple % world_size
138
- # The first @mod ranks get @div + 1 copies, the rest get @div copies
139
- local_multiple = div + int(torch.distributed.get_rank(process_group) < mod)
140
- super().__init__(
141
- in_features,
142
- local_multiple * multiple_of,
143
- bias=bias,
144
- device=device,
145
- dtype=dtype,
146
- )
147
- self.process_group = process_group
148
- self.sequence_parallel = sequence_parallel
149
-
150
- def forward(self, x):
151
- # If self.sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism:
152
- # we do an all_gather of x before doing the matmul.
153
- # If not, then the input is already gathered.
154
- return parallel_linear_func(
155
- x,
156
- self.weight,
157
- self.bias,
158
- process_group=self.process_group,
159
- sequence_parallel=self.sequence_parallel,
160
- )
161
-
162
-
163
- class RowParallelLinear(nn.Linear):
164
- def __init__(
165
- self,
166
- in_features: int,
167
- out_features: int,
168
- process_group: ProcessGroup,
169
- bias: bool = True,
170
- sequence_parallel=True,
171
- multiple_of=1,
172
- device=None,
173
- dtype=None,
174
- ) -> None:
175
- world_size = torch.distributed.get_world_size(process_group)
176
- rank = torch.distributed.get_rank(process_group)
177
- if in_features % multiple_of:
178
- raise ValueError(
179
- f"in_features ({in_features}) must be a multiple of {multiple_of}"
180
- )
181
- multiple = in_features // multiple_of
182
- # We want to split @multiple across world_size, but it could be an uneven split
183
- div = multiple // world_size
184
- mod = multiple % world_size
185
- # The first @mod ranks get @div + 1 copies, the rest get @div copies
186
- local_multiple = div + int(torch.distributed.get_rank(process_group) < mod)
187
- # Only rank 0 will have bias
188
- super().__init__(
189
- local_multiple * multiple_of,
190
- out_features,
191
- bias=bias and rank == 0,
192
- device=device,
193
- dtype=dtype,
194
- )
195
- self.process_group = process_group
196
- self.sequence_parallel = sequence_parallel
197
-
198
- def forward(self, x):
199
- """
200
- We're doing Tensor Parallel with sequence parallelism: we do the matmul and then
201
- a reduce_scatter of the result.
202
- """
203
- out = parallel_linear_func(x, self.weight, self.bias)
204
- reduce_fn = reduce_scatter if self.sequence_parallel else all_reduce
205
- return reduce_fn(out, self.process_group)
206
-
207
-
208
- class VocabParallelEmbedding(nn.Embedding):
209
- def __init__(
210
- self, num_embeddings, *args, process_group=None, padding_idx=None, **kwargs
211
- ):
212
- self.process_group = process_group
213
- if process_group is not None:
214
- world_size = torch.distributed.get_world_size(process_group)
215
- if num_embeddings % world_size != 0:
216
- raise ValueError(
217
- f"num_embeddings ({num_embeddings}) must be divisible by "
218
- f"world_size ({world_size})"
219
- )
220
- if world_size > 1 and padding_idx is not None:
221
- raise RuntimeError("ParallelEmbedding does not support padding_idx")
222
- else:
223
- world_size = 1
224
- super().__init__(
225
- num_embeddings // world_size, *args, padding_idx=padding_idx, **kwargs
226
- )
227
-
228
- def forward(self, input: Tensor) -> Tensor:
229
- if self.process_group is None:
230
- return super().forward(input)
231
- else:
232
- rank = torch.distributed.get_rank(self.process_group)
233
- vocab_size = self.num_embeddings
234
- vocab_start_index, vocab_end_index = (
235
- rank * vocab_size,
236
- (rank + 1) * vocab_size,
237
- )
238
- # Create a mask of valid vocab ids (1 means it needs to be masked).
239
- input_ids_mask = (input < vocab_start_index) | (input >= vocab_end_index)
240
- input = input - vocab_start_index
241
- input[input_ids_mask] = 0
242
- embeddings = super().forward(input)
243
- embeddings[input_ids_mask] = 0.0
244
- return embeddings
245
-
246
-
247
- class ColumnParallelEmbedding(nn.Embedding):
248
- def __init__(
249
- self, num_embeddings, embedding_dim, *args, process_group=None, **kwargs
250
- ):
251
- self.process_group = process_group
252
- if process_group is not None:
253
- world_size = torch.distributed.get_world_size(process_group)
254
- if embedding_dim % world_size != 0:
255
- raise ValueError(
256
- f"embedding_dim ({embedding_dim}) must be divisible by "
257
- f"world_size ({world_size})"
258
- )
259
- else:
260
- world_size = 1
261
- super().__init__(num_embeddings, embedding_dim // world_size, *args, **kwargs)
262
-
263
-
264
- class ParallelEmbeddings(nn.Module):
265
- def __init__(
266
- self,
267
- embed_dim,
268
- vocab_size,
269
- max_position_embeddings,
270
- process_group,
271
- padding_idx=None,
272
- sequence_parallel=True,
273
- device=None,
274
- dtype=None,
275
- ):
276
- """
277
- If max_position_embeddings <= 0, there's no position embeddings
278
- """
279
- factory_kwargs = {"device": device, "dtype": dtype}
280
- super().__init__()
281
- self.process_group = process_group
282
- self.sequence_parallel = sequence_parallel
283
- self.word_embeddings = VocabParallelEmbedding(
284
- vocab_size,
285
- embed_dim,
286
- padding_idx=padding_idx,
287
- process_group=process_group,
288
- **factory_kwargs,
289
- )
290
- self.max_position_embeddings = max_position_embeddings
291
- if self.max_position_embeddings > 0:
292
- self.position_embeddings = ColumnParallelEmbedding(
293
- max_position_embeddings,
294
- embed_dim,
295
- process_group=process_group,
296
- **factory_kwargs,
297
- )
298
-
299
- def forward(self, input_ids, position_ids=None, combine_batch_seqlen_dim=False):
300
- """
301
- input_ids: (batch, seqlen)
302
- position_ids: (batch, seqlen)
303
- """
304
- batch_size, seqlen = input_ids.shape
305
- world_size = torch.distributed.get_world_size(self.process_group)
306
- embeddings = self.word_embeddings(input_ids)
307
- if self.max_position_embeddings > 0:
308
- if position_ids is None:
309
- position_ids = torch.arange(
310
- seqlen, dtype=torch.long, device=input_ids.device
311
- )
312
- position_embeddings = self.position_embeddings(position_ids)
313
- if world_size <= 1:
314
- embeddings = embeddings + position_embeddings
315
- else:
316
- partition_dim = self.position_embeddings.embedding_dim
317
- rank = torch.distributed.get_rank(self.process_group)
318
- embeddings[
319
- ..., rank * partition_dim : (rank + 1) * partition_dim
320
- ] += position_embeddings
321
- if combine_batch_seqlen_dim:
322
- embeddings = rearrange(embeddings, "b s d -> (b s) d")
323
- reduce_fn = reduce_scatter if self.sequence_parallel else all_reduce
324
- return (
325
- embeddings if world_size <= 1 else reduce_fn(embeddings, self.process_group)
326
- )
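The linear layers above shard their weights across the ranks of a process group; with world_size == 1 each rank simply holds the full matrix, which makes the sharding arithmetic easy to inspect without GPUs. A single-process sketch, assuming the package and its distributed_utils/einops dependencies are importable; backend, addresses, and sizes are placeholders.

```python
import os
import torch.distributed as dist
from mamba_ssm.distributed.tensor_parallel import ColumnParallelLinear, RowParallelLinear

# Single-process "cluster" just to create a process group for the constructors.
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
dist.init_process_group(backend="gloo", rank=0, world_size=1)
group = dist.group.WORLD

col = ColumnParallelLinear(512, 2048, process_group=group, bias=True)
row = RowParallelLinear(2048, 512, process_group=group, bias=True)
print(col.weight.shape)  # (2048 // world_size, 512): output features are sharded
print(row.weight.shape)  # (512, 2048 // world_size): input features are sharded
dist.destroy_process_group()
```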
build/torch25-cxx98-cu118-x86_64-linux/mamba_ssm/models/__init__.py DELETED
File without changes
build/torch25-cxx98-cu118-x86_64-linux/mamba_ssm/models/mixer_seq_simple.py DELETED
@@ -1,338 +0,0 @@
1
- # Copyright (c) 2023, Albert Gu, Tri Dao.
2
-
3
- import math
4
- from functools import partial
5
- import json
6
- import os
7
- import copy
8
-
9
- from collections import namedtuple
10
-
11
- import torch
12
- import torch.nn as nn
13
-
14
- from .config_mamba import MambaConfig
15
- from ..modules.mamba_simple import Mamba
16
- from ..modules.mamba2 import Mamba2
17
- from ..modules.mha import MHA
18
- from ..modules.mlp import GatedMLP
19
- from ..modules.block import Block
20
- from ..utils.generation import GenerationMixin
21
- from ..utils.hf import load_config_hf, load_state_dict_hf
22
-
23
- try:
24
- from ..ops.triton.layer_norm import RMSNorm, layer_norm_fn, rms_norm_fn
25
- except ImportError:
26
- RMSNorm, layer_norm_fn, rms_norm_fn = None, None, None
27
-
28
-
29
- def create_block(
30
- d_model,
31
- d_intermediate,
32
- ssm_cfg=None,
33
- attn_layer_idx=None,
34
- attn_cfg=None,
35
- norm_epsilon=1e-5,
36
- rms_norm=False,
37
- residual_in_fp32=False,
38
- fused_add_norm=False,
39
- layer_idx=None,
40
- device=None,
41
- dtype=None,
42
- ):
43
- if ssm_cfg is None:
44
- ssm_cfg = {}
45
- if attn_layer_idx is None:
46
- attn_layer_idx = []
47
- if attn_cfg is None:
48
- attn_cfg = {}
49
- factory_kwargs = {"device": device, "dtype": dtype}
50
- if layer_idx not in attn_layer_idx:
51
- # Create a copy of the config to modify
52
- ssm_cfg = copy.deepcopy(ssm_cfg) if ssm_cfg is not None else {}
53
- ssm_layer = ssm_cfg.pop("layer", "Mamba1")
54
- if ssm_layer not in ["Mamba1", "Mamba2"]:
55
- raise ValueError(
56
- f"Invalid ssm_layer: {ssm_layer}, only support Mamba1 and Mamba2"
57
- )
58
- mixer_cls = partial(
59
- Mamba2 if ssm_layer == "Mamba2" else Mamba,
60
- layer_idx=layer_idx,
61
- **ssm_cfg,
62
- **factory_kwargs,
63
- )
64
- else:
65
- mixer_cls = partial(MHA, layer_idx=layer_idx, **attn_cfg, **factory_kwargs)
66
- norm_cls = partial(
67
- nn.LayerNorm if not rms_norm else RMSNorm, eps=norm_epsilon, **factory_kwargs
68
- )
69
- if d_intermediate == 0:
70
- mlp_cls = nn.Identity
71
- else:
72
- mlp_cls = partial(
73
- GatedMLP,
74
- hidden_features=d_intermediate,
75
- out_features=d_model,
76
- **factory_kwargs,
77
- )
78
- block = Block(
79
- d_model,
80
- mixer_cls,
81
- mlp_cls,
82
- norm_cls=norm_cls,
83
- fused_add_norm=fused_add_norm,
84
- residual_in_fp32=residual_in_fp32,
85
- )
86
- block.layer_idx = layer_idx
87
- return block
88
-
89
-
90
- # https://github.com/huggingface/transformers/blob/c28d04e9e252a1a099944e325685f14d242ecdcd/src/transformers/models/gpt2/modeling_gpt2.py#L454
91
- def _init_weights(
92
- module,
93
- n_layer,
94
- initializer_range=0.02, # Now only used for embedding layer.
95
- rescale_prenorm_residual=True,
96
- n_residuals_per_layer=1, # Change to 2 if we have MLP
97
- ):
98
- if isinstance(module, nn.Linear):
99
- if module.bias is not None:
100
- if not getattr(module.bias, "_no_reinit", False):
101
- nn.init.zeros_(module.bias)
102
- elif isinstance(module, nn.Embedding):
103
- nn.init.normal_(module.weight, std=initializer_range)
104
-
105
- if rescale_prenorm_residual:
106
- # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
107
- # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
108
- # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
109
- # > -- GPT-2 :: https://openai.com/blog/better-language-models/
110
- #
111
- # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
112
- for name, p in module.named_parameters():
113
- if name in ["out_proj.weight", "fc2.weight"]:
114
- # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
115
- # Following Pytorch init, except scale by 1/sqrt(2 * n_layer)
116
- # We need to reinit p since this code could be called multiple times
117
- # Having just p *= scale would repeatedly scale it down
118
- nn.init.kaiming_uniform_(p, a=math.sqrt(5))
119
- with torch.no_grad():
120
- p /= math.sqrt(n_residuals_per_layer * n_layer)
121
-
122
-
123
- class MixerModel(nn.Module):
124
- def __init__(
125
- self,
126
- d_model: int,
127
- n_layer: int,
128
- d_intermediate: int,
129
- vocab_size: int,
130
- ssm_cfg=None,
131
- attn_layer_idx=None,
132
- attn_cfg=None,
133
- norm_epsilon: float = 1e-5,
134
- rms_norm: bool = False,
135
- initializer_cfg=None,
136
- fused_add_norm=False,
137
- residual_in_fp32=False,
138
- device=None,
139
- dtype=None,
140
- ) -> None:
141
- factory_kwargs = {"device": device, "dtype": dtype}
142
- super().__init__()
143
- self.residual_in_fp32 = residual_in_fp32
144
-
145
- self.embedding = nn.Embedding(vocab_size, d_model, **factory_kwargs)
146
-
147
- # We change the order of residual and layer norm:
148
- # Instead of LN -> Attn / MLP -> Add, we do:
149
- # Add -> LN -> Attn / MLP / Mixer, returning both the residual branch (output of Add) and
150
- # the main branch (output of MLP / Mixer). The model definition is unchanged.
151
- # This is for performance reason: we can fuse add + layer_norm.
152
- self.fused_add_norm = fused_add_norm
153
- if self.fused_add_norm:
154
- if layer_norm_fn is None or rms_norm_fn is None:
155
- raise ImportError("Failed to import Triton LayerNorm / RMSNorm kernels")
156
-
157
- self.layers = nn.ModuleList(
158
- [
159
- create_block(
160
- d_model,
161
- d_intermediate=d_intermediate,
162
- ssm_cfg=ssm_cfg,
163
- attn_layer_idx=attn_layer_idx,
164
- attn_cfg=attn_cfg,
165
- norm_epsilon=norm_epsilon,
166
- rms_norm=rms_norm,
167
- residual_in_fp32=residual_in_fp32,
168
- fused_add_norm=fused_add_norm,
169
- layer_idx=i,
170
- **factory_kwargs,
171
- )
172
- for i in range(n_layer)
173
- ]
174
- )
175
-
176
- self.norm_f = (nn.LayerNorm if not rms_norm else RMSNorm)(
177
- d_model, eps=norm_epsilon, **factory_kwargs
178
- )
179
-
180
- self.apply(
181
- partial(
182
- _init_weights,
183
- n_layer=n_layer,
184
- **(initializer_cfg if initializer_cfg is not None else {}),
185
- n_residuals_per_layer=(
186
- 1 if d_intermediate == 0 else 2
187
- ), # 2 if we have MLP
188
- )
189
- )
190
-
191
- def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
192
- return {
193
- i: layer.allocate_inference_cache(
194
- batch_size, max_seqlen, dtype=dtype, **kwargs
195
- )
196
- for i, layer in enumerate(self.layers)
197
- }
198
-
199
- def forward(self, input_ids, inference_params=None, **mixer_kwargs):
200
- hidden_states = self.embedding(input_ids)
201
- residual = None
202
- for layer in self.layers:
203
- hidden_states, residual = layer(
204
- hidden_states,
205
- residual,
206
- inference_params=inference_params,
207
- **mixer_kwargs,
208
- )
209
- if not self.fused_add_norm:
210
- residual = (
211
- (hidden_states + residual) if residual is not None else hidden_states
212
- )
213
- hidden_states = self.norm_f(residual.to(dtype=self.norm_f.weight.dtype))
214
- else:
215
- # Set prenorm=False here since we don't need the residual
216
- hidden_states = layer_norm_fn(
217
- hidden_states,
218
- self.norm_f.weight,
219
- self.norm_f.bias,
220
- eps=self.norm_f.eps,
221
- residual=residual,
222
- prenorm=False,
223
- residual_in_fp32=self.residual_in_fp32,
224
- is_rms_norm=isinstance(self.norm_f, RMSNorm),
225
- )
226
- return hidden_states
227
-
228
-
229
- class MambaLMHeadModel(nn.Module, GenerationMixin):
230
-
231
- def __init__(
232
- self,
233
- config: MambaConfig,
234
- initializer_cfg=None,
235
- device=None,
236
- dtype=None,
237
- ) -> None:
238
- self.config = config
239
- d_model = config.d_model
240
- n_layer = config.n_layer
241
- d_intermediate = config.d_intermediate
242
- vocab_size = config.vocab_size
243
- ssm_cfg = config.ssm_cfg
244
- attn_layer_idx = config.attn_layer_idx
245
- attn_cfg = config.attn_cfg
246
- rms_norm = config.rms_norm
247
- residual_in_fp32 = config.residual_in_fp32
248
- fused_add_norm = config.fused_add_norm
249
- pad_vocab_size_multiple = config.pad_vocab_size_multiple
250
- factory_kwargs = {"device": device, "dtype": dtype}
251
-
252
- super().__init__()
253
- if vocab_size % pad_vocab_size_multiple != 0:
254
- vocab_size += pad_vocab_size_multiple - (
255
- vocab_size % pad_vocab_size_multiple
256
- )
257
- self.backbone = MixerModel(
258
- d_model=d_model,
259
- n_layer=n_layer,
260
- d_intermediate=d_intermediate,
261
- vocab_size=vocab_size,
262
- ssm_cfg=ssm_cfg,
263
- attn_layer_idx=attn_layer_idx,
264
- attn_cfg=attn_cfg,
265
- rms_norm=rms_norm,
266
- initializer_cfg=initializer_cfg,
267
- fused_add_norm=fused_add_norm,
268
- residual_in_fp32=residual_in_fp32,
269
- **factory_kwargs,
270
- )
271
- self.lm_head = nn.Linear(d_model, vocab_size, bias=False, **factory_kwargs)
272
-
273
- # Initialize weights and apply final processing
274
- self.apply(
275
- partial(
276
- _init_weights,
277
- n_layer=n_layer,
278
- **(initializer_cfg if initializer_cfg is not None else {}),
279
- )
280
- )
281
- self.tie_weights()
282
-
283
- def tie_weights(self):
284
- if self.config.tie_embeddings:
285
- self.lm_head.weight = self.backbone.embedding.weight
286
-
287
- def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
288
- return self.backbone.allocate_inference_cache(
289
- batch_size, max_seqlen, dtype=dtype, **kwargs
290
- )
291
-
292
- def forward(
293
- self,
294
- input_ids,
295
- position_ids=None,
296
- inference_params=None,
297
- num_last_tokens=0,
298
- **mixer_kwargs,
299
- ):
300
- """
301
- "position_ids" is just to be compatible with Transformer generation. We don't use it.
302
- num_last_tokens: if > 0, only return the logits for the last n tokens
303
- """
304
- hidden_states = self.backbone(
305
- input_ids, inference_params=inference_params, **mixer_kwargs
306
- )
307
- if num_last_tokens > 0:
308
- hidden_states = hidden_states[:, -num_last_tokens:]
309
- lm_logits = self.lm_head(hidden_states)
310
- CausalLMOutput = namedtuple("CausalLMOutput", ["logits"])
311
- return CausalLMOutput(logits=lm_logits)
312
-
313
- @classmethod
314
- def from_pretrained(cls, pretrained_model_name, device=None, dtype=None, **kwargs):
315
- config_data = load_config_hf(pretrained_model_name)
316
- config = MambaConfig(**config_data)
317
- model = cls(config, device=device, dtype=dtype, **kwargs)
318
- model.load_state_dict(
319
- load_state_dict_hf(pretrained_model_name, device=device, dtype=dtype)
320
- )
321
- return model
322
-
323
- def save_pretrained(self, save_directory):
324
- """
325
- Minimal implementation of save_pretrained for MambaLMHeadModel.
326
- Save the model and its configuration file to a directory.
327
- """
328
- # Ensure save_directory exists
329
- os.makedirs(save_directory, exist_ok=True)
330
-
331
- # Save the model's state_dict
332
- model_path = os.path.join(save_directory, "pytorch_model.bin")
333
- torch.save(self.state_dict(), model_path)
334
-
335
- # Save the configuration of the model
336
- config_path = os.path.join(save_directory, "config.json")
337
- with open(config_path, "w") as f:
338
- json.dump(self.config.__dict__, f, indent=4)
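Putting the classes above together, the LM head model is instantiated from a `MambaConfig` and queried for logits. A sketch assuming a CUDA device with the Triton layer-norm kernels available (rms_norm and fused_add_norm default to True in upstream configs) and `MambaConfig` field names/defaults as in upstream mamba-ssm; the sizes are toy values.

```python
import torch
from mamba_ssm.models.config_mamba import MambaConfig
from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel

config = MambaConfig(d_model=256, n_layer=4, vocab_size=1000)
model = MambaLMHeadModel(config, device="cuda", dtype=torch.float32)

input_ids = torch.randint(0, 1000, (2, 32), device="cuda")
logits = model(input_ids).logits          # (2, 32, 1000); 1000 is already a multiple of 8
model.save_pretrained("/tmp/tiny-mamba")  # writes pytorch_model.bin and config.json
```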
build/torch25-cxx98-cu118-x86_64-linux/mamba_ssm/modules/__init__.py DELETED
File without changes
build/torch25-cxx98-cu118-x86_64-linux/mamba_ssm/ops/__init__.py DELETED
File without changes
build/torch25-cxx98-cu118-x86_64-linux/mamba_ssm/ops/selective_scan_interface.py DELETED
@@ -1,659 +0,0 @@
1
- # Copyright (c) 2023, Tri Dao, Albert Gu.
2
-
3
- import torch
4
- import torch.nn.functional as F
5
- from ..utils.torch import custom_fwd, custom_bwd
6
-
7
- from einops import rearrange, repeat
8
-
9
- try:
10
- from causal_conv1d import causal_conv1d_fn
11
- import causal_conv1d_cuda
12
- except ImportError:
13
- causal_conv1d_fn = None
14
- causal_conv1d_cuda = None
15
-
16
- from .triton.layer_norm import _layer_norm_fwd
17
-
18
- from .._ops import ops
19
-
20
-
21
- class SelectiveScanFn(torch.autograd.Function):
22
-
23
- @staticmethod
24
- def forward(
25
- ctx,
26
- u,
27
- delta,
28
- A,
29
- B,
30
- C,
31
- D=None,
32
- z=None,
33
- delta_bias=None,
34
- delta_softplus=False,
35
- return_last_state=False,
36
- ):
37
- if u.stride(-1) != 1:
38
- u = u.contiguous()
39
- if delta.stride(-1) != 1:
40
- delta = delta.contiguous()
41
- if D is not None:
42
- D = D.contiguous()
43
- if B.stride(-1) != 1:
44
- B = B.contiguous()
45
- if C.stride(-1) != 1:
46
- C = C.contiguous()
47
- if z is not None and z.stride(-1) != 1:
48
- z = z.contiguous()
49
- if B.dim() == 3:
50
- B = rearrange(B, "b dstate l -> b 1 dstate l")
51
- ctx.squeeze_B = True
52
- if C.dim() == 3:
53
- C = rearrange(C, "b dstate l -> b 1 dstate l")
54
- ctx.squeeze_C = True
55
- out, x, *rest = ops.selective_scan_fwd(
56
- u, delta, A, B, C, D, z, delta_bias, delta_softplus
57
- )
58
- ctx.delta_softplus = delta_softplus
59
- ctx.has_z = z is not None
60
- last_state = x[:, :, -1, 1::2] # (batch, dim, dstate)
61
- if not ctx.has_z:
62
- ctx.save_for_backward(u, delta, A, B, C, D, delta_bias, x)
63
- return out if not return_last_state else (out, last_state)
64
- else:
65
- ctx.save_for_backward(u, delta, A, B, C, D, z, delta_bias, x, out)
66
- out_z = rest[0]
67
- return out_z if not return_last_state else (out_z, last_state)
68
-
69
- @staticmethod
70
- def backward(ctx, dout, *args):
71
- if not ctx.has_z:
72
- u, delta, A, B, C, D, delta_bias, x = ctx.saved_tensors
73
- z = None
74
- out = None
75
- else:
76
- u, delta, A, B, C, D, z, delta_bias, x, out = ctx.saved_tensors
77
- if dout.stride(-1) != 1:
78
- dout = dout.contiguous()
79
- # The kernel supports passing in a pre-allocated dz (e.g., in case we want to fuse the
80
- # backward of selective_scan_cuda with the backward of chunk).
81
- # Here we just pass in None and dz will be allocated in the C++ code.
82
- du, ddelta, dA, dB, dC, dD, ddelta_bias, *rest = ops.selective_scan_bwd(
83
- u,
84
- delta,
85
- A,
86
- B,
87
- C,
88
- D,
89
- z,
90
- delta_bias,
91
- dout,
92
- x,
93
- out,
94
- None,
95
- ctx.delta_softplus,
96
- False, # option to recompute out_z, not used here
97
- )
98
- dz = rest[0] if ctx.has_z else None
99
- dB = dB.squeeze(1) if getattr(ctx, "squeeze_B", False) else dB
100
- dC = dC.squeeze(1) if getattr(ctx, "squeeze_C", False) else dC
101
- return (
102
- du,
103
- ddelta,
104
- dA,
105
- dB,
106
- dC,
107
- dD if D is not None else None,
108
- dz,
109
- ddelta_bias if delta_bias is not None else None,
110
- None,
111
- None,
112
- )
113
-
114
-
115
- def rms_norm_forward(
116
- x,
117
- weight,
118
- bias,
119
- eps=1e-6,
120
- is_rms_norm=True,
121
- ):
122
- # x (b l) d
123
- if x.stride(-1) != 1:
124
- x = x.contiguous()
125
- weight = weight.contiguous()
126
- if bias is not None:
127
- bias = bias.contiguous()
128
- y = _layer_norm_fwd(
129
- x, weight, bias, eps, None, residual_dtype=None, is_rms_norm=is_rms_norm
130
- )[0]
131
- # y (b l) d
132
- return y
133
-
134
-
135
- def selective_scan_fn(
136
- u,
137
- delta,
138
- A,
139
- B,
140
- C,
141
- D=None,
142
- z=None,
143
- delta_bias=None,
144
- delta_softplus=False,
145
- return_last_state=False,
146
- ):
147
- """if return_last_state is True, returns (out, last_state)
148
- last_state has shape (batch, dim, dstate). Note that the gradient of the last state is
149
- not considered in the backward pass.
150
- """
151
- return SelectiveScanFn.apply(
152
- u, delta, A, B, C, D, z, delta_bias, delta_softplus, return_last_state
153
- )
154
-
155
-
156
- def selective_scan_ref(
157
- u,
158
- delta,
159
- A,
160
- B,
161
- C,
162
- D=None,
163
- z=None,
164
- delta_bias=None,
165
- delta_softplus=False,
166
- return_last_state=False,
167
- ):
168
- """
169
- u: r(B D L)
170
- delta: r(B D L)
171
- A: c(D N) or r(D N)
172
- B: c(D N) or r(B N L) or r(B N 2L) or r(B G N L) or (B G N L)
173
- C: c(D N) or r(B N L) or r(B N 2L) or r(B G N L) or (B G N L)
174
- D: r(D)
175
- z: r(B D L)
176
- delta_bias: r(D), fp32
177
-
178
- out: r(B D L)
179
- last_state (optional): r(B D dstate) or c(B D dstate)
180
- """
181
- dtype_in = u.dtype
182
- u = u.float()
183
- delta = delta.float()
184
- if delta_bias is not None:
185
- delta = delta + delta_bias[..., None].float()
186
- if delta_softplus:
187
- delta = F.softplus(delta)
188
- batch, dim, dstate = u.shape[0], A.shape[0], A.shape[1]
189
- is_variable_B = B.dim() >= 3
190
- is_variable_C = C.dim() >= 3
191
- if A.is_complex():
192
- if is_variable_B:
193
- B = torch.view_as_complex(
194
- rearrange(B.float(), "... (L two) -> ... L two", two=2)
195
- )
196
- if is_variable_C:
197
- C = torch.view_as_complex(
198
- rearrange(C.float(), "... (L two) -> ... L two", two=2)
199
- )
200
- else:
201
- B = B.float()
202
- C = C.float()
203
- x = A.new_zeros((batch, dim, dstate))
204
- ys = []
205
- deltaA = torch.exp(torch.einsum("bdl,dn->bdln", delta, A))
206
- if not is_variable_B:
207
- deltaB_u = torch.einsum("bdl,dn,bdl->bdln", delta, B, u)
208
- else:
209
- if B.dim() == 3:
210
- deltaB_u = torch.einsum("bdl,bnl,bdl->bdln", delta, B, u)
211
- else:
212
- B = repeat(B, "B G N L -> B (G H) N L", H=dim // B.shape[1])
213
- deltaB_u = torch.einsum("bdl,bdnl,bdl->bdln", delta, B, u)
214
- if is_variable_C and C.dim() == 4:
215
- C = repeat(C, "B G N L -> B (G H) N L", H=dim // C.shape[1])
216
- last_state = None
217
- for i in range(u.shape[2]):
218
- x = deltaA[:, :, i] * x + deltaB_u[:, :, i]
219
- if not is_variable_C:
220
- y = torch.einsum("bdn,dn->bd", x, C)
221
- else:
222
- if C.dim() == 3:
223
- y = torch.einsum("bdn,bn->bd", x, C[:, :, i])
224
- else:
225
- y = torch.einsum("bdn,bdn->bd", x, C[:, :, :, i])
226
- if i == u.shape[2] - 1:
227
- last_state = x
228
- if y.is_complex():
229
- y = y.real * 2
230
- ys.append(y)
231
- y = torch.stack(ys, dim=2) # (batch dim L)
232
- out = y if D is None else y + u * rearrange(D, "d -> d 1")
233
- if z is not None:
234
- out = out * F.silu(z)
235
- out = out.to(dtype=dtype_in)
236
- return out if not return_last_state else (out, last_state)
237
-
238
-
239
- class MambaInnerFn(torch.autograd.Function):
240
-
241
- @staticmethod
242
- @custom_fwd
243
- def forward(
244
- ctx,
245
- xz,
246
- conv1d_weight,
247
- conv1d_bias,
248
- x_proj_weight,
249
- delta_proj_weight,
250
- out_proj_weight,
251
- out_proj_bias,
252
- A,
253
- B=None,
254
- C=None,
255
- D=None,
256
- delta_bias=None,
257
- B_proj_bias=None,
258
- C_proj_bias=None,
259
- delta_softplus=True,
260
- checkpoint_lvl=1,
261
- b_rms_weight=None,
262
- c_rms_weight=None,
263
- dt_rms_weight=None,
264
- b_c_dt_rms_eps=1e-6,
265
- ):
266
- """
267
- xz: (batch, dim, seqlen)
268
- """
269
- assert (
270
- causal_conv1d_cuda is not None
271
- ), "causal_conv1d_cuda is not available. Please install causal-conv1d."
272
- assert checkpoint_lvl in [0, 1]
273
- L = xz.shape[-1]
274
- delta_rank = delta_proj_weight.shape[1]
275
- d_state = A.shape[-1] * (1 if not A.is_complex() else 2)
276
- if torch.is_autocast_enabled():
277
- x_proj_weight = x_proj_weight.to(dtype=torch.get_autocast_gpu_dtype())
278
- delta_proj_weight = delta_proj_weight.to(
279
- dtype=torch.get_autocast_gpu_dtype()
280
- )
281
- out_proj_weight = out_proj_weight.to(dtype=torch.get_autocast_gpu_dtype())
282
- out_proj_bias = (
283
- out_proj_bias.to(dtype=torch.get_autocast_gpu_dtype())
284
- if out_proj_bias is not None
285
- else None
286
- )
287
- if xz.stride(-1) != 1:
288
- xz = xz.contiguous()
289
- conv1d_weight = rearrange(conv1d_weight, "d 1 w -> d w")
290
- x, z = xz.chunk(2, dim=1)
291
- conv1d_bias = conv1d_bias.contiguous() if conv1d_bias is not None else None
292
- conv1d_out = causal_conv1d_cuda.causal_conv1d_fwd(
293
- x, conv1d_weight, conv1d_bias, None, None, None, True
294
- )
295
- # We're being very careful here about the layout, to avoid extra transposes.
296
- # We want delta to have d as the slowest moving dimension
297
- # and L as the fastest moving dimension, since those are what the ssm_scan kernel expects.
298
- x_dbl = F.linear(
299
- rearrange(conv1d_out, "b d l -> (b l) d"), x_proj_weight
300
- ) # (bl d)
301
- delta = rearrange(
302
- delta_proj_weight @ x_dbl[:, :delta_rank].t(), "d (b l) -> b d l", l=L
303
- )
304
- ctx.is_variable_B = B is None
305
- ctx.is_variable_C = C is None
306
- ctx.B_proj_bias_is_None = B_proj_bias is None
307
- ctx.C_proj_bias_is_None = C_proj_bias is None
308
- if B is None: # variable B
309
- B = x_dbl[:, delta_rank : delta_rank + d_state] # (bl dstate)
310
- if B_proj_bias is not None:
311
- B = B + B_proj_bias.to(dtype=B.dtype)
312
- if not A.is_complex():
313
- # B = rearrange(B, "(b l) dstate -> b dstate l", l=L).contiguous()
314
- B = rearrange(B, "(b l) dstate -> b 1 dstate l", l=L).contiguous()
315
- else:
316
- B = rearrange(
317
- B, "(b l) (dstate two) -> b 1 dstate (l two)", l=L, two=2
318
- ).contiguous()
319
- else:
320
- if B.stride(-1) != 1:
321
- B = B.contiguous()
322
- if C is None: # variable C
323
- C = x_dbl[:, -d_state:] # (bl dstate)
324
- if C_proj_bias is not None:
325
- C = C + C_proj_bias.to(dtype=C.dtype)
326
- if not A.is_complex():
327
- # C = rearrange(C, "(b l) dstate -> b dstate l", l=L).contiguous()
328
- C = rearrange(C, "(b l) dstate -> b 1 dstate l", l=L).contiguous()
329
- else:
330
- C = rearrange(
331
- C, "(b l) (dstate two) -> b 1 dstate (l two)", l=L, two=2
332
- ).contiguous()
333
- else:
334
- if C.stride(-1) != 1:
335
- C = C.contiguous()
336
- if D is not None:
337
- D = D.contiguous()
338
-
339
- if b_rms_weight is not None:
340
- B = rearrange(B, "b 1 dstate l -> (b l) dstate", l=L).contiguous()
341
- B = rms_norm_forward(B, b_rms_weight, bias=None, eps=b_c_dt_rms_eps)
342
- B = rearrange(B, "(b l) dstate -> b 1 dstate l", l=L).contiguous()
343
- if c_rms_weight is not None:
344
- C = rearrange(C, "b 1 dstate l -> (b l) dstate", l=L).contiguous()
345
- C = rms_norm_forward(C, c_rms_weight, bias=None, eps=b_c_dt_rms_eps)
346
- C = rearrange(C, "(b l) dstate -> b 1 dstate l", l=L).contiguous()
347
- if dt_rms_weight is not None:
348
- delta = rearrange(delta, "b d l -> (b l) d", l=L).contiguous()
349
- delta = rms_norm_forward(
350
- delta, dt_rms_weight, bias=None, eps=b_c_dt_rms_eps
351
- )
352
- delta = rearrange(delta, "(b l) d -> b d l", l=L).contiguous()
353
-
354
- out, scan_intermediates, out_z = ops.selective_scan_fwd(
355
- conv1d_out, delta, A, B, C, D, z, delta_bias, delta_softplus
356
- )
357
- ctx.delta_softplus = delta_softplus
358
- ctx.out_proj_bias_is_None = out_proj_bias is None
359
- ctx.checkpoint_lvl = checkpoint_lvl
360
- ctx.b_rms_weight = b_rms_weight
361
- ctx.c_rms_weight = c_rms_weight
362
- ctx.dt_rms_weight = dt_rms_weight
363
- ctx.b_c_dt_rms_eps = b_c_dt_rms_eps
364
- if (
365
- checkpoint_lvl >= 1
366
- ): # Will recompute conv1d_out and delta in the backward pass
367
- conv1d_out, delta = None, None
368
- ctx.save_for_backward(
369
- xz,
370
- conv1d_weight,
371
- conv1d_bias,
372
- x_dbl,
373
- x_proj_weight,
374
- delta_proj_weight,
375
- out_proj_weight,
376
- conv1d_out,
377
- delta,
378
- A,
379
- B,
380
- C,
381
- D,
382
- delta_bias,
383
- scan_intermediates,
384
- b_rms_weight,
385
- c_rms_weight,
386
- dt_rms_weight,
387
- out,
388
- )
389
- return F.linear(
390
- rearrange(out_z, "b d l -> b l d"), out_proj_weight, out_proj_bias
391
- )
392
-
393
- @staticmethod
394
- @custom_bwd
395
- def backward(ctx, dout):
396
- # dout: (batch, seqlen, dim)
397
- assert (
398
- causal_conv1d_cuda is not None
399
- ), "causal_conv1d_cuda is not available. Please install causal-conv1d."
400
- (
401
- xz,
402
- conv1d_weight,
403
- conv1d_bias,
404
- x_dbl,
405
- x_proj_weight,
406
- delta_proj_weight,
407
- out_proj_weight,
408
- conv1d_out,
409
- delta,
410
- A,
411
- B,
412
- C,
413
- D,
414
- delta_bias,
415
- scan_intermediates,
416
- b_rms_weight,
417
- c_rms_weight,
418
- dt_rms_weight,
419
- out,
420
- ) = ctx.saved_tensors
421
- L = xz.shape[-1]
422
- delta_rank = delta_proj_weight.shape[1]
423
- d_state = A.shape[-1] * (1 if not A.is_complex() else 2)
424
- x, z = xz.chunk(2, dim=1)
425
- if dout.stride(-1) != 1:
426
- dout = dout.contiguous()
427
- if ctx.checkpoint_lvl == 1:
428
- conv1d_out = causal_conv1d_cuda.causal_conv1d_fwd(
429
- x, conv1d_weight, conv1d_bias, None, None, None, True
430
- )
431
- delta = rearrange(
432
- delta_proj_weight @ x_dbl[:, :delta_rank].t(), "d (b l) -> b d l", l=L
433
- )
434
- if dt_rms_weight is not None:
435
- delta = rearrange(delta, "b d l -> (b l) d", l=L).contiguous()
436
- delta = rms_norm_forward(
437
- delta, ctx.dt_rms_weight, None, ctx.b_c_dt_rms_eps
438
- )
439
- delta = rearrange(delta, "(b l) d -> b d l", l=L).contiguous()
440
- if b_rms_weight is not None:
441
- # Recompute & RMSNorm B
442
- B = rearrange(B, "b 1 dstate l -> (b l) dstate", l=L).contiguous()
443
- B = rms_norm_forward(B, ctx.b_rms_weight, None, ctx.b_c_dt_rms_eps)
444
- B = rearrange(B, "(b l) dstate -> b 1 dstate l", l=L).contiguous()
445
- if c_rms_weight is not None:
446
- # Recompute & RMSNorm C
447
- C = rearrange(C, "b 1 dstate l -> (b l) dstate", l=L).contiguous()
448
- C = rms_norm_forward(C, ctx.c_rms_weight, None, ctx.b_c_dt_rms_eps)
449
- C = rearrange(C, "(b l) dstate -> b 1 dstate l", l=L).contiguous()
450
-
451
- # The kernel supports passing in a pre-allocated dz (e.g., in case we want to fuse the
452
- # backward of selective_scan_cuda with the backward of chunk).
453
- dxz = torch.empty_like(xz) # (batch, dim, seqlen)
454
- dx, dz = dxz.chunk(2, dim=1)
455
- dout = rearrange(dout, "b l e -> e (b l)")
456
- dout_y = rearrange(out_proj_weight.t() @ dout, "d (b l) -> b d l", l=L)
457
- dconv1d_out, ddelta, dA, dB, dC, dD, ddelta_bias, dz, out_z = (
458
- ops.selective_scan_bwd(
459
- conv1d_out,
460
- delta,
461
- A,
462
- B,
463
- C,
464
- D,
465
- z,
466
- delta_bias,
467
- dout_y,
468
- scan_intermediates,
469
- out,
470
- dz,
471
- ctx.delta_softplus,
472
- True, # option to recompute out_z
473
- )
474
- )
475
- dout_proj_weight = torch.einsum(
476
- "eB,dB->ed", dout, rearrange(out_z, "b d l -> d (b l)")
477
- )
478
- dout_proj_bias = dout.sum(dim=(0, 1)) if not ctx.out_proj_bias_is_None else None
479
- dD = dD if D is not None else None
480
- dx_dbl = torch.empty_like(x_dbl)
481
- dB_proj_bias = None
482
- if ctx.is_variable_B:
483
- if not A.is_complex():
484
- dB = rearrange(dB, "b 1 dstate l -> (b l) dstate").contiguous()
485
- else:
486
- dB = rearrange(
487
- dB, "b 1 dstate (l two) -> (b l) (dstate two)", two=2
488
- ).contiguous()
489
- dB_proj_bias = dB.sum(0) if not ctx.B_proj_bias_is_None else None
490
- dx_dbl[:, delta_rank : delta_rank + d_state] = dB # (bl d)
491
- dB = None
492
- dC_proj_bias = None
493
- if ctx.is_variable_C:
494
- if not A.is_complex():
495
- dC = rearrange(dC, "b 1 dstate l -> (b l) dstate").contiguous()
496
- else:
497
- dC = rearrange(
498
- dC, "b 1 dstate (l two) -> (b l) (dstate two)", two=2
499
- ).contiguous()
500
- dC_proj_bias = dC.sum(0) if not ctx.C_proj_bias_is_None else None
501
- dx_dbl[:, -d_state:] = dC # (bl d)
502
- dC = None
503
- ddelta = rearrange(ddelta, "b d l -> d (b l)")
504
- ddelta_proj_weight = torch.einsum("dB,Br->dr", ddelta, x_dbl[:, :delta_rank])
505
- dx_dbl[:, :delta_rank] = torch.einsum("dB,dr->Br", ddelta, delta_proj_weight)
506
- dconv1d_out = rearrange(dconv1d_out, "b d l -> d (b l)")
507
- dx_proj_weight = torch.einsum(
508
- "Br,Bd->rd", dx_dbl, rearrange(conv1d_out, "b d l -> (b l) d")
509
- )
510
- dconv1d_out = torch.addmm(
511
- dconv1d_out, x_proj_weight.t(), dx_dbl.t(), out=dconv1d_out
512
- )
513
- dconv1d_out = rearrange(
514
- dconv1d_out, "d (b l) -> b d l", b=x.shape[0], l=x.shape[-1]
515
- )
516
- # The kernel supports passing in a pre-allocated dx (e.g., in case we want to fuse the
517
- # backward of conv1d with the backward of chunk).
518
- dx, dconv1d_weight, dconv1d_bias, *_ = causal_conv1d_cuda.causal_conv1d_bwd(
519
- x,
520
- conv1d_weight,
521
- conv1d_bias,
522
- dconv1d_out,
523
- None,
524
- None,
525
- None,
526
- dx,
527
- False,
528
- True,
529
- )
530
- dconv1d_bias = dconv1d_bias if conv1d_bias is not None else None
531
- dconv1d_weight = rearrange(dconv1d_weight, "d w -> d 1 w")
532
- return (
533
- dxz,
534
- dconv1d_weight,
535
- dconv1d_bias,
536
- dx_proj_weight,
537
- ddelta_proj_weight,
538
- dout_proj_weight,
539
- dout_proj_bias,
540
- dA,
541
- dB,
542
- dC,
543
- dD,
544
- ddelta_bias if delta_bias is not None else None,
545
- # 6-None are delta_softplus, checkpoint_lvl, b_rms_weight, c_rms_weight, dt_rms_weight, b_c_dt_rms_eps
546
- dB_proj_bias,
547
- dC_proj_bias,
548
- None,
549
- None,
550
- None,
551
- None,
552
- None,
553
- None,
554
- )
555
-
556
-
557
- def mamba_inner_fn(
558
- xz,
559
- conv1d_weight,
560
- conv1d_bias,
561
- x_proj_weight,
562
- delta_proj_weight,
563
- out_proj_weight,
564
- out_proj_bias,
565
- A,
566
- B=None,
567
- C=None,
568
- D=None,
569
- delta_bias=None,
570
- B_proj_bias=None,
571
- C_proj_bias=None,
572
- delta_softplus=True,
573
- checkpoint_lvl=1,
574
- b_rms_weight=None,
575
- c_rms_weight=None,
576
- dt_rms_weight=None,
577
- b_c_dt_rms_eps=1e-6,
578
- ):
579
- return MambaInnerFn.apply(
580
- xz,
581
- conv1d_weight,
582
- conv1d_bias,
583
- x_proj_weight,
584
- delta_proj_weight,
585
- out_proj_weight,
586
- out_proj_bias,
587
- A,
588
- B,
589
- C,
590
- D,
591
- delta_bias,
592
- B_proj_bias,
593
- C_proj_bias,
594
- delta_softplus,
595
- checkpoint_lvl,
596
- b_rms_weight,
597
- c_rms_weight,
598
- dt_rms_weight,
599
- b_c_dt_rms_eps,
600
- )
601
-
602
-
603
- def mamba_inner_ref(
604
- xz,
605
- conv1d_weight,
606
- conv1d_bias,
607
- x_proj_weight,
608
- delta_proj_weight,
609
- out_proj_weight,
610
- out_proj_bias,
611
- A,
612
- B=None,
613
- C=None,
614
- D=None,
615
- delta_bias=None,
616
- B_proj_bias=None,
617
- C_proj_bias=None,
618
- delta_softplus=True,
619
- ):
620
- assert (
621
- causal_conv1d_fn is not None
622
- ), "causal_conv1d_fn is not available. Please install causal-conv1d."
623
- L = xz.shape[-1]
624
- delta_rank = delta_proj_weight.shape[1]
625
- d_state = A.shape[-1] * (1 if not A.is_complex() else 2)
626
- x, z = xz.chunk(2, dim=1)
627
- x = causal_conv1d_fn(
628
- x, rearrange(conv1d_weight, "d 1 w -> d w"), conv1d_bias, activation="silu"
629
- )
630
- # We're being very careful here about the layout, to avoid extra transposes.
631
- # We want delta to have d as the slowest moving dimension
632
- # and L as the fastest moving dimension, since those are what the ssm_scan kernel expects.
633
- x_dbl = F.linear(rearrange(x, "b d l -> (b l) d"), x_proj_weight) # (bl d)
634
- delta = delta_proj_weight @ x_dbl[:, :delta_rank].t()
635
- delta = rearrange(delta, "d (b l) -> b d l", l=L)
636
- if B is None: # variable B
637
- B = x_dbl[:, delta_rank : delta_rank + d_state] # (bl d)
638
- if B_proj_bias is not None:
639
- B = B + B_proj_bias.to(dtype=B.dtype)
640
- if not A.is_complex():
641
- B = rearrange(B, "(b l) dstate -> b dstate l", l=L).contiguous()
642
- else:
643
- B = rearrange(
644
- B, "(b l) (dstate two) -> b dstate (l two)", l=L, two=2
645
- ).contiguous()
646
- if C is None: # variable B
647
- C = x_dbl[:, -d_state:] # (bl d)
648
- if C_proj_bias is not None:
649
- C = C + C_proj_bias.to(dtype=C.dtype)
650
- if not A.is_complex():
651
- C = rearrange(C, "(b l) dstate -> b dstate l", l=L).contiguous()
652
- else:
653
- C = rearrange(
654
- C, "(b l) (dstate two) -> b dstate (l two)", l=L, two=2
655
- ).contiguous()
656
- y = selective_scan_fn(
657
- x, delta, A, B, C, D, z=z, delta_bias=delta_bias, delta_softplus=True
658
- )
659
- return F.linear(rearrange(y, "b d l -> b l d"), out_proj_weight, out_proj_bias)
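Since `selective_scan_ref` above is a pure-PyTorch reference for the fused kernel, the two paths can be compared directly on random inputs. A sketch assuming the compiled selective-scan CUDA extension is available; shapes follow the docstring conventions (u, delta: (B, D, L); A: (D, N); variable B, C: (B, N, L)) and the values are arbitrary.

```python
import torch
from mamba_ssm.ops.selective_scan_interface import selective_scan_fn, selective_scan_ref

batch, dim, dstate, seqlen = 2, 16, 8, 64
dev = "cuda"

u = torch.randn(batch, dim, seqlen, device=dev)
delta = torch.rand(batch, dim, seqlen, device=dev)
A = -torch.rand(dim, dstate, device=dev)
B = torch.randn(batch, dstate, seqlen, device=dev)   # variable (input-dependent) B
C = torch.randn(batch, dstate, seqlen, device=dev)   # variable (input-dependent) C
D = torch.randn(dim, device=dev)

out_kernel = selective_scan_fn(u, delta, A, B, C, D, delta_softplus=True)
out_ref = selective_scan_ref(u, delta, A, B, C, D, delta_softplus=True)
print((out_kernel - out_ref).abs().max())  # expected to be small (fp32 kernel vs. reference)
```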
build/torch25-cxx98-cu118-x86_64-linux/mamba_ssm/ops/triton/__init__.py DELETED
File without changes
build/torch25-cxx98-cu118-x86_64-linux/mamba_ssm/ops/triton/layer_norm.py DELETED
@@ -1,1166 +0,0 @@
1
- # Copyright (c) 2024, Tri Dao.
2
- # Implement dropout + residual + layer_norm / rms_norm.
3
-
4
- # Based on the Triton LayerNorm tutorial: https://triton-lang.org/main/getting-started/tutorials/05-layer-norm.html
5
- # For the backward pass, we keep weight_grad and bias_grad in registers and accumulate.
6
- # This is faster for dimensions up to 8k, but after that it's much slower due to register spilling.
7
- # The models we train have hidden dim up to 8k anyway (e.g. Llama 70B), so this is fine.
8
-
9
- import math
10
- import warnings
11
-
12
- import torch
13
- import torch.nn.functional as F
14
- from ...utils.torch import custom_bwd, custom_fwd
15
-
16
- import triton
17
- import triton.language as tl
18
-
19
-
20
- def layer_norm_ref(
21
- x,
22
- weight,
23
- bias,
24
- residual=None,
25
- x1=None,
26
- weight1=None,
27
- bias1=None,
28
- eps=1e-6,
29
- dropout_p=0.0,
30
- rowscale=None,
31
- prenorm=False,
32
- dropout_mask=None,
33
- dropout_mask1=None,
34
- upcast=False,
35
- ):
36
- dtype = x.dtype
37
- if upcast:
38
- x = x.float()
39
- weight = weight.float()
40
- bias = bias.float() if bias is not None else None
41
- residual = residual.float() if residual is not None else residual
42
- x1 = x1.float() if x1 is not None else None
43
- weight1 = weight1.float() if weight1 is not None else None
44
- bias1 = bias1.float() if bias1 is not None else None
45
- if x1 is not None:
46
- assert rowscale is None, "rowscale is not supported with parallel LayerNorm"
47
- if rowscale is not None:
48
- x = x * rowscale[..., None]
49
- if dropout_p > 0.0:
50
- if dropout_mask is not None:
51
- x = x.masked_fill(~dropout_mask, 0.0) / (1.0 - dropout_p)
52
- else:
53
- x = F.dropout(x, p=dropout_p)
54
- if x1 is not None:
55
- if dropout_mask1 is not None:
56
- x1 = x1.masked_fill(~dropout_mask1, 0.0) / (1.0 - dropout_p)
57
- else:
58
- x1 = F.dropout(x1, p=dropout_p)
59
- if x1 is not None:
60
- x = x + x1
61
- if residual is not None:
62
- x = (x + residual).to(x.dtype)
63
- out = F.layer_norm(
64
- x.to(weight.dtype), x.shape[-1:], weight=weight, bias=bias, eps=eps
65
- ).to(dtype)
66
- if weight1 is None:
67
- return out if not prenorm else (out, x)
68
- else:
69
- out1 = F.layer_norm(
70
- x.to(weight1.dtype), x.shape[-1:], weight=weight1, bias=bias1, eps=eps
71
- ).to(dtype)
72
- return (out, out1) if not prenorm else (out, out1, x)
73
-
74
-
75
- def rms_norm_ref(
76
- x,
77
- weight,
78
- bias,
79
- residual=None,
80
- x1=None,
81
- weight1=None,
82
- bias1=None,
83
- eps=1e-6,
84
- dropout_p=0.0,
85
- rowscale=None,
86
- prenorm=False,
87
- dropout_mask=None,
88
- dropout_mask1=None,
89
- upcast=False,
90
- ):
91
- dtype = x.dtype
92
- if upcast:
93
- x = x.float()
94
- weight = weight.float()
95
- bias = bias.float() if bias is not None else None
96
- residual = residual.float() if residual is not None else residual
97
- x1 = x1.float() if x1 is not None else None
98
- weight1 = weight1.float() if weight1 is not None else None
99
- bias1 = bias1.float() if bias1 is not None else None
100
- if x1 is not None:
101
- assert rowscale is None, "rowscale is not supported with parallel LayerNorm"
102
- if rowscale is not None:
103
- x = x * rowscale[..., None]
104
- if dropout_p > 0.0:
105
- if dropout_mask is not None:
106
- x = x.masked_fill(~dropout_mask, 0.0) / (1.0 - dropout_p)
107
- else:
108
- x = F.dropout(x, p=dropout_p)
109
- if x1 is not None:
110
- if dropout_mask1 is not None:
111
- x1 = x1.masked_fill(~dropout_mask1, 0.0) / (1.0 - dropout_p)
112
- else:
113
- x1 = F.dropout(x1, p=dropout_p)
114
- if x1 is not None:
115
- x = x + x1
116
- if residual is not None:
117
- x = (x + residual).to(x.dtype)
118
- rstd = 1 / torch.sqrt((x.square()).mean(dim=-1, keepdim=True) + eps)
119
- out = ((x * rstd * weight) + bias if bias is not None else (x * rstd * weight)).to(
120
- dtype
121
- )
122
- if weight1 is None:
123
- return out if not prenorm else (out, x)
124
- else:
125
- out1 = (
126
- (x * rstd * weight1) + bias1 if bias1 is not None else (x * rstd * weight1)
127
- ).to(dtype)
128
- return (out, out1) if not prenorm else (out, out1, x)
129
-
130
-
131
- def config_prune(configs):
132
-
133
- if torch.version.hip:
134
- try:
135
- # set warp size based on gcn architecture
136
- gcn_arch_name = torch.cuda.get_device_properties(0).gcnArchName
137
- if "gfx10" in gcn_arch_name or "gfx11" in gcn_arch_name:
138
- # radeon
139
- warp_size = 32
140
- else:
141
- # instinct
142
- warp_size = 64
143
- except AttributeError as e:
144
- # fall back to crude method to set warp size
145
- device_name = torch.cuda.get_device_properties(0).name
146
- if "instinct" in device_name.lower():
147
- warp_size = 64
148
- else:
149
- warp_size = 32
150
- warnings.warn(
151
- f"{e}, warp size set to {warp_size} based on device name: {device_name}",
152
- UserWarning,
153
- )
154
-
155
- else:
156
- # cuda
157
- warp_size = 32
158
-
159
- max_block_sz = 1024
160
- max_num_warps = max_block_sz // warp_size
161
- pruned_configs = [config for config in configs if config.num_warps <= max_num_warps]
162
- return pruned_configs
163
-
164
-
165
- configs_autotune = [
166
- triton.Config({}, num_warps=1),
167
- triton.Config({}, num_warps=2),
168
- triton.Config({}, num_warps=4),
169
- triton.Config({}, num_warps=8),
170
- triton.Config({}, num_warps=16),
171
- triton.Config({}, num_warps=32),
172
- ]
173
-
174
- pruned_configs_autotune = config_prune(configs_autotune)
175
-
176
-
177
- @triton.autotune(
178
- configs=pruned_configs_autotune,
179
- key=["N", "HAS_RESIDUAL", "STORE_RESIDUAL_OUT", "IS_RMS_NORM", "HAS_BIAS"],
180
- )
181
- # @triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None})
182
- # @triton.heuristics({"HAS_RESIDUAL": lambda args: args["RESIDUAL"] is not None})
183
- @triton.heuristics({"HAS_X1": lambda args: args["X1"] is not None})
184
- @triton.heuristics({"HAS_W1": lambda args: args["W1"] is not None})
185
- @triton.heuristics({"HAS_B1": lambda args: args["B1"] is not None})
186
- @triton.jit
187
- def _layer_norm_fwd_1pass_kernel(
188
- X, # pointer to the input
189
- Y, # pointer to the output
190
- W, # pointer to the weights
191
- B, # pointer to the biases
192
- RESIDUAL, # pointer to the residual
193
- X1,
194
- W1,
195
- B1,
196
- Y1,
197
- RESIDUAL_OUT, # pointer to the residual
198
- ROWSCALE,
199
- SEEDS, # Dropout seeds for each row
200
- DROPOUT_MASK,
201
- Mean, # pointer to the mean
202
- Rstd, # pointer to the 1/std
203
- stride_x_row, # how much to increase the pointer when moving by 1 row
204
- stride_y_row,
205
- stride_res_row,
206
- stride_res_out_row,
207
- stride_x1_row,
208
- stride_y1_row,
209
- M, # number of rows in X
210
- N, # number of columns in X
211
- eps, # epsilon to avoid division by zero
212
- dropout_p, # Dropout probability
213
- IS_RMS_NORM: tl.constexpr,
214
- BLOCK_N: tl.constexpr,
215
- HAS_RESIDUAL: tl.constexpr,
216
- STORE_RESIDUAL_OUT: tl.constexpr,
217
- HAS_BIAS: tl.constexpr,
218
- HAS_DROPOUT: tl.constexpr,
219
- STORE_DROPOUT_MASK: tl.constexpr,
220
- HAS_ROWSCALE: tl.constexpr,
221
- HAS_X1: tl.constexpr,
222
- HAS_W1: tl.constexpr,
223
- HAS_B1: tl.constexpr,
224
- ):
225
- # Map the program id to the row of X and Y it should compute.
226
- row = tl.program_id(0)
227
- X += row * stride_x_row
228
- Y += row * stride_y_row
229
- if HAS_RESIDUAL:
230
- RESIDUAL += row * stride_res_row
231
- if STORE_RESIDUAL_OUT:
232
- RESIDUAL_OUT += row * stride_res_out_row
233
- if HAS_X1:
234
- X1 += row * stride_x1_row
235
- if HAS_W1:
236
- Y1 += row * stride_y1_row
237
- # Compute mean and variance
238
- cols = tl.arange(0, BLOCK_N)
239
- x = tl.load(X + cols, mask=cols < N, other=0.0).to(tl.float32)
240
- if HAS_ROWSCALE:
241
- rowscale = tl.load(ROWSCALE + row).to(tl.float32)
242
- x *= rowscale
243
- if HAS_DROPOUT:
244
- # Compute dropout mask
245
- # 7 rounds is good enough, and reduces register pressure
246
- keep_mask = (
247
- tl.rand(tl.load(SEEDS + row).to(tl.uint32), cols, n_rounds=7) > dropout_p
248
- )
249
- x = tl.where(keep_mask, x / (1.0 - dropout_p), 0.0)
250
- if STORE_DROPOUT_MASK:
251
- tl.store(DROPOUT_MASK + row * N + cols, keep_mask, mask=cols < N)
252
- if HAS_X1:
253
- x1 = tl.load(X1 + cols, mask=cols < N, other=0.0).to(tl.float32)
254
- if HAS_ROWSCALE:
255
- rowscale = tl.load(ROWSCALE + M + row).to(tl.float32)
256
- x1 *= rowscale
257
- if HAS_DROPOUT:
258
- # Compute dropout mask
259
- # 7 rounds is good enough, and reduces register pressure
260
- keep_mask = (
261
- tl.rand(tl.load(SEEDS + M + row).to(tl.uint32), cols, n_rounds=7)
262
- > dropout_p
263
- )
264
- x1 = tl.where(keep_mask, x1 / (1.0 - dropout_p), 0.0)
265
- if STORE_DROPOUT_MASK:
266
- tl.store(DROPOUT_MASK + (M + row) * N + cols, keep_mask, mask=cols < N)
267
- x += x1
268
- if HAS_RESIDUAL:
269
- residual = tl.load(RESIDUAL + cols, mask=cols < N, other=0.0).to(tl.float32)
270
- x += residual
271
- if STORE_RESIDUAL_OUT:
272
- tl.store(RESIDUAL_OUT + cols, x, mask=cols < N)
273
- if not IS_RMS_NORM:
274
- mean = tl.sum(x, axis=0) / N
275
- tl.store(Mean + row, mean)
276
- xbar = tl.where(cols < N, x - mean, 0.0)
277
- var = tl.sum(xbar * xbar, axis=0) / N
278
- else:
279
- xbar = tl.where(cols < N, x, 0.0)
280
- var = tl.sum(xbar * xbar, axis=0) / N
281
- rstd = 1 / tl.sqrt(var + eps)
282
- tl.store(Rstd + row, rstd)
283
- # Normalize and apply linear transformation
284
- mask = cols < N
285
- w = tl.load(W + cols, mask=mask).to(tl.float32)
286
- if HAS_BIAS:
287
- b = tl.load(B + cols, mask=mask).to(tl.float32)
288
- x_hat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd
289
- y = x_hat * w + b if HAS_BIAS else x_hat * w
290
- # Write output
291
- tl.store(Y + cols, y, mask=mask)
292
- if HAS_W1:
293
- w1 = tl.load(W1 + cols, mask=mask).to(tl.float32)
294
- if HAS_B1:
295
- b1 = tl.load(B1 + cols, mask=mask).to(tl.float32)
296
- y1 = x_hat * w1 + b1 if HAS_B1 else x_hat * w1
297
- tl.store(Y1 + cols, y1, mask=mask)
298
-
299
-
300
- def _layer_norm_fwd(
301
- x,
302
- weight,
303
- bias,
304
- eps,
305
- residual=None,
306
- x1=None,
307
- weight1=None,
308
- bias1=None,
309
- dropout_p=0.0,
310
- rowscale=None,
311
- out_dtype=None,
312
- residual_dtype=None,
313
- is_rms_norm=False,
314
- return_dropout_mask=False,
315
- ):
316
- if residual is not None:
317
- residual_dtype = residual.dtype
318
- M, N = x.shape
319
- assert x.stride(-1) == 1
320
- if residual is not None:
321
- assert residual.stride(-1) == 1
322
- assert residual.shape == (M, N)
323
- assert weight.shape == (N,)
324
- assert weight.stride(-1) == 1
325
- if bias is not None:
326
- assert bias.stride(-1) == 1
327
- assert bias.shape == (N,)
328
- if x1 is not None:
329
- assert x1.shape == x.shape
330
- assert rowscale is None
331
- assert x1.stride(-1) == 1
332
- if weight1 is not None:
333
- assert weight1.shape == (N,)
334
- assert weight1.stride(-1) == 1
335
- if bias1 is not None:
336
- assert bias1.shape == (N,)
337
- assert bias1.stride(-1) == 1
338
- if rowscale is not None:
339
- assert rowscale.is_contiguous()
340
- assert rowscale.shape == (M,)
341
- # allocate output
342
- y = torch.empty_like(x, dtype=x.dtype if out_dtype is None else out_dtype)
343
- assert y.stride(-1) == 1
344
- if weight1 is not None:
345
- y1 = torch.empty_like(y)
346
- assert y1.stride(-1) == 1
347
- else:
348
- y1 = None
349
- if (
350
- residual is not None
351
- or (residual_dtype is not None and residual_dtype != x.dtype)
352
- or dropout_p > 0.0
353
- or rowscale is not None
354
- or x1 is not None
355
- ):
356
- residual_out = torch.empty(
357
- M,
358
- N,
359
- device=x.device,
360
- dtype=residual_dtype if residual_dtype is not None else x.dtype,
361
- )
362
- assert residual_out.stride(-1) == 1
363
- else:
364
- residual_out = None
365
- mean = (
366
- torch.empty((M,), dtype=torch.float32, device=x.device)
367
- if not is_rms_norm
368
- else None
369
- )
370
- rstd = torch.empty((M,), dtype=torch.float32, device=x.device)
371
- if dropout_p > 0.0:
372
- seeds = torch.randint(
373
- 2**32, (M if x1 is None else 2 * M,), device=x.device, dtype=torch.int64
374
- )
375
- else:
376
- seeds = None
377
- if return_dropout_mask and dropout_p > 0.0:
378
- dropout_mask = torch.empty(
379
- M if x1 is None else 2 * M, N, device=x.device, dtype=torch.bool
380
- )
381
- else:
382
- dropout_mask = None
383
- # Less than 64KB per feature: enqueue fused kernel
384
- MAX_FUSED_SIZE = 65536 // x.element_size()
385
- BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
386
- if N > BLOCK_N:
387
- raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
388
- with torch.cuda.device(x.device.index):
389
- _layer_norm_fwd_1pass_kernel[(M,)](
390
- x,
391
- y,
392
- weight,
393
- bias,
394
- residual,
395
- x1,
396
- weight1,
397
- bias1,
398
- y1,
399
- residual_out,
400
- rowscale,
401
- seeds,
402
- dropout_mask,
403
- mean,
404
- rstd,
405
- x.stride(0),
406
- y.stride(0),
407
- residual.stride(0) if residual is not None else 0,
408
- residual_out.stride(0) if residual_out is not None else 0,
409
- x1.stride(0) if x1 is not None else 0,
410
- y1.stride(0) if y1 is not None else 0,
411
- M,
412
- N,
413
- eps,
414
- dropout_p,
415
- is_rms_norm,
416
- BLOCK_N,
417
- residual is not None,
418
- residual_out is not None,
419
- bias is not None,
420
- dropout_p > 0.0,
421
- dropout_mask is not None,
422
- rowscale is not None,
423
- )
424
- # residual_out is None if residual is None and residual_dtype == input_dtype and dropout_p == 0.0
425
- if dropout_mask is not None and x1 is not None:
426
- dropout_mask, dropout_mask1 = dropout_mask.tensor_split(2, dim=0)
427
- else:
428
- dropout_mask1 = None
429
- return (
430
- y,
431
- y1,
432
- mean,
433
- rstd,
434
- residual_out if residual_out is not None else x,
435
- seeds,
436
- dropout_mask,
437
- dropout_mask1,
438
- )
439
-
440
-
441
- @triton.autotune(
442
- configs=pruned_configs_autotune,
443
- key=[
444
- "N",
445
- "HAS_DRESIDUAL",
446
- "STORE_DRESIDUAL",
447
- "IS_RMS_NORM",
448
- "HAS_BIAS",
449
- "HAS_DROPOUT",
450
- ],
451
- )
452
- # @triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None})
453
- # @triton.heuristics({"HAS_DRESIDUAL": lambda args: args["DRESIDUAL"] is not None})
454
- # @triton.heuristics({"STORE_DRESIDUAL": lambda args: args["DRESIDUAL_IN"] is not None})
455
- @triton.heuristics({"HAS_ROWSCALE": lambda args: args["ROWSCALE"] is not None})
456
- @triton.heuristics({"HAS_DY1": lambda args: args["DY1"] is not None})
457
- @triton.heuristics({"HAS_DX1": lambda args: args["DX1"] is not None})
458
- @triton.heuristics({"HAS_B1": lambda args: args["DB1"] is not None})
459
- @triton.heuristics({"RECOMPUTE_OUTPUT": lambda args: args["Y"] is not None})
460
- @triton.jit
461
- def _layer_norm_bwd_kernel(
462
- X, # pointer to the input
463
- W, # pointer to the weights
464
- B, # pointer to the biases
465
- Y, # pointer to the output to be recomputed
466
- DY, # pointer to the output gradient
467
- DX, # pointer to the input gradient
468
- DW, # pointer to the partial sum of weights gradient
469
- DB, # pointer to the partial sum of biases gradient
470
- DRESIDUAL,
471
- W1,
472
- DY1,
473
- DX1,
474
- DW1,
475
- DB1,
476
- DRESIDUAL_IN,
477
- ROWSCALE,
478
- SEEDS,
479
- Mean, # pointer to the mean
480
- Rstd, # pointer to the 1/std
481
- stride_x_row, # how much to increase the pointer when moving by 1 row
482
- stride_y_row,
483
- stride_dy_row,
484
- stride_dx_row,
485
- stride_dres_row,
486
- stride_dy1_row,
487
- stride_dx1_row,
488
- stride_dres_in_row,
489
- M, # number of rows in X
490
- N, # number of columns in X
491
- eps, # epsilon to avoid division by zero
492
- dropout_p,
493
- rows_per_program,
494
- IS_RMS_NORM: tl.constexpr,
495
- BLOCK_N: tl.constexpr,
496
- HAS_DRESIDUAL: tl.constexpr,
497
- STORE_DRESIDUAL: tl.constexpr,
498
- HAS_BIAS: tl.constexpr,
499
- HAS_DROPOUT: tl.constexpr,
500
- HAS_ROWSCALE: tl.constexpr,
501
- HAS_DY1: tl.constexpr,
502
- HAS_DX1: tl.constexpr,
503
- HAS_B1: tl.constexpr,
504
- RECOMPUTE_OUTPUT: tl.constexpr,
505
- ):
506
- # Map the program id to the elements of X, DX, and DY it should compute.
507
- row_block_id = tl.program_id(0)
508
- row_start = row_block_id * rows_per_program
509
- # Do not early exit if row_start >= M, because we need to write DW and DB
510
- cols = tl.arange(0, BLOCK_N)
511
- mask = cols < N
512
- X += row_start * stride_x_row
513
- if HAS_DRESIDUAL:
514
- DRESIDUAL += row_start * stride_dres_row
515
- if STORE_DRESIDUAL:
516
- DRESIDUAL_IN += row_start * stride_dres_in_row
517
- DY += row_start * stride_dy_row
518
- DX += row_start * stride_dx_row
519
- if HAS_DY1:
520
- DY1 += row_start * stride_dy1_row
521
- if HAS_DX1:
522
- DX1 += row_start * stride_dx1_row
523
- if RECOMPUTE_OUTPUT:
524
- Y += row_start * stride_y_row
525
- w = tl.load(W + cols, mask=mask).to(tl.float32)
526
- if RECOMPUTE_OUTPUT and HAS_BIAS:
527
- b = tl.load(B + cols, mask=mask, other=0.0).to(tl.float32)
528
- if HAS_DY1:
529
- w1 = tl.load(W1 + cols, mask=mask).to(tl.float32)
530
- dw = tl.zeros((BLOCK_N,), dtype=tl.float32)
531
- if HAS_BIAS:
532
- db = tl.zeros((BLOCK_N,), dtype=tl.float32)
533
- if HAS_DY1:
534
- dw1 = tl.zeros((BLOCK_N,), dtype=tl.float32)
535
- if HAS_B1:
536
- db1 = tl.zeros((BLOCK_N,), dtype=tl.float32)
537
- row_end = min((row_block_id + 1) * rows_per_program, M)
538
- for row in range(row_start, row_end):
539
- # Load data to SRAM
540
- x = tl.load(X + cols, mask=mask, other=0).to(tl.float32)
541
- dy = tl.load(DY + cols, mask=mask, other=0).to(tl.float32)
542
- if HAS_DY1:
543
- dy1 = tl.load(DY1 + cols, mask=mask, other=0).to(tl.float32)
544
- if not IS_RMS_NORM:
545
- mean = tl.load(Mean + row)
546
- rstd = tl.load(Rstd + row)
547
- # Compute dx
548
- xhat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd
549
- xhat = tl.where(mask, xhat, 0.0)
550
- if RECOMPUTE_OUTPUT:
551
- y = xhat * w + b if HAS_BIAS else xhat * w
552
- tl.store(Y + cols, y, mask=mask)
553
- wdy = w * dy
554
- dw += dy * xhat
555
- if HAS_BIAS:
556
- db += dy
557
- if HAS_DY1:
558
- wdy += w1 * dy1
559
- dw1 += dy1 * xhat
560
- if HAS_B1:
561
- db1 += dy1
562
- if not IS_RMS_NORM:
563
- c1 = tl.sum(xhat * wdy, axis=0) / N
564
- c2 = tl.sum(wdy, axis=0) / N
565
- dx = (wdy - (xhat * c1 + c2)) * rstd
566
- else:
567
- c1 = tl.sum(xhat * wdy, axis=0) / N
568
- dx = (wdy - xhat * c1) * rstd
569
- if HAS_DRESIDUAL:
570
- dres = tl.load(DRESIDUAL + cols, mask=mask, other=0).to(tl.float32)
571
- dx += dres
572
- # Write dx
573
- if STORE_DRESIDUAL:
574
- tl.store(DRESIDUAL_IN + cols, dx, mask=mask)
575
- if HAS_DX1:
576
- if HAS_DROPOUT:
577
- keep_mask = (
578
- tl.rand(tl.load(SEEDS + M + row).to(tl.uint32), cols, n_rounds=7)
579
- > dropout_p
580
- )
581
- dx1 = tl.where(keep_mask, dx / (1.0 - dropout_p), 0.0)
582
- else:
583
- dx1 = dx
584
- tl.store(DX1 + cols, dx1, mask=mask)
585
- if HAS_DROPOUT:
586
- keep_mask = (
587
- tl.rand(tl.load(SEEDS + row).to(tl.uint32), cols, n_rounds=7)
588
- > dropout_p
589
- )
590
- dx = tl.where(keep_mask, dx / (1.0 - dropout_p), 0.0)
591
- if HAS_ROWSCALE:
592
- rowscale = tl.load(ROWSCALE + row).to(tl.float32)
593
- dx *= rowscale
594
- tl.store(DX + cols, dx, mask=mask)
595
-
596
- X += stride_x_row
597
- if HAS_DRESIDUAL:
598
- DRESIDUAL += stride_dres_row
599
- if STORE_DRESIDUAL:
600
- DRESIDUAL_IN += stride_dres_in_row
601
- if RECOMPUTE_OUTPUT:
602
- Y += stride_y_row
603
- DY += stride_dy_row
604
- DX += stride_dx_row
605
- if HAS_DY1:
606
- DY1 += stride_dy1_row
607
- if HAS_DX1:
608
- DX1 += stride_dx1_row
609
- tl.store(DW + row_block_id * N + cols, dw, mask=mask)
610
- if HAS_BIAS:
611
- tl.store(DB + row_block_id * N + cols, db, mask=mask)
612
- if HAS_DY1:
613
- tl.store(DW1 + row_block_id * N + cols, dw1, mask=mask)
614
- if HAS_B1:
615
- tl.store(DB1 + row_block_id * N + cols, db1, mask=mask)
616
-
617
-
618
- def _layer_norm_bwd(
619
- dy,
620
- x,
621
- weight,
622
- bias,
623
- eps,
624
- mean,
625
- rstd,
626
- dresidual=None,
627
- dy1=None,
628
- weight1=None,
629
- bias1=None,
630
- seeds=None,
631
- dropout_p=0.0,
632
- rowscale=None,
633
- has_residual=False,
634
- has_x1=False,
635
- is_rms_norm=False,
636
- x_dtype=None,
637
- recompute_output=False,
638
- ):
639
- M, N = x.shape
640
- assert x.stride(-1) == 1
641
- assert dy.stride(-1) == 1
642
- assert dy.shape == (M, N)
643
- if dresidual is not None:
644
- assert dresidual.stride(-1) == 1
645
- assert dresidual.shape == (M, N)
646
- assert weight.shape == (N,)
647
- assert weight.stride(-1) == 1
648
- if bias is not None:
649
- assert bias.stride(-1) == 1
650
- assert bias.shape == (N,)
651
- if dy1 is not None:
652
- assert weight1 is not None
653
- assert dy1.shape == dy.shape
654
- assert dy1.stride(-1) == 1
655
- if weight1 is not None:
656
- assert weight1.shape == (N,)
657
- assert weight1.stride(-1) == 1
658
- if bias1 is not None:
659
- assert bias1.shape == (N,)
660
- assert bias1.stride(-1) == 1
661
- if seeds is not None:
662
- assert seeds.is_contiguous()
663
- assert seeds.shape == (M if not has_x1 else M * 2,)
664
- if rowscale is not None:
665
- assert rowscale.is_contiguous()
666
- assert rowscale.shape == (M,)
667
- # allocate output
668
- dx = (
669
- torch.empty_like(x)
670
- if x_dtype is None
671
- else torch.empty(M, N, dtype=x_dtype, device=x.device)
672
- )
673
- dresidual_in = (
674
- torch.empty_like(x)
675
- if has_residual
676
- and (dx.dtype != x.dtype or dropout_p > 0.0 or rowscale is not None or has_x1)
677
- else None
678
- )
679
- dx1 = torch.empty_like(dx) if (has_x1 and dropout_p > 0.0) else None
680
- y = (
681
- torch.empty(M, N, dtype=dy.dtype, device=dy.device)
682
- if recompute_output
683
- else None
684
- )
685
- if recompute_output:
686
- assert (
687
- weight1 is None
688
- ), "recompute_output is not supported with parallel LayerNorm"
689
-
690
- # Less than 64KB per feature: enqueue fused kernel
691
- MAX_FUSED_SIZE = 65536 // x.element_size()
692
- BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
693
- if N > BLOCK_N:
694
- raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
695
- sm_count = torch.cuda.get_device_properties(x.device).multi_processor_count
696
- _dw = torch.empty((sm_count, N), dtype=torch.float32, device=weight.device)
697
- _db = (
698
- torch.empty((sm_count, N), dtype=torch.float32, device=bias.device)
699
- if bias is not None
700
- else None
701
- )
702
- _dw1 = torch.empty_like(_dw) if weight1 is not None else None
703
- _db1 = torch.empty_like(_db) if bias1 is not None else None
704
- rows_per_program = math.ceil(M / sm_count)
705
- grid = (sm_count,)
706
- with torch.cuda.device(x.device.index):
707
- _layer_norm_bwd_kernel[grid](
708
- x,
709
- weight,
710
- bias,
711
- y,
712
- dy,
713
- dx,
714
- _dw,
715
- _db,
716
- dresidual,
717
- weight1,
718
- dy1,
719
- dx1,
720
- _dw1,
721
- _db1,
722
- dresidual_in,
723
- rowscale,
724
- seeds,
725
- mean,
726
- rstd,
727
- x.stride(0),
728
- 0 if not recompute_output else y.stride(0),
729
- dy.stride(0),
730
- dx.stride(0),
731
- dresidual.stride(0) if dresidual is not None else 0,
732
- dy1.stride(0) if dy1 is not None else 0,
733
- dx1.stride(0) if dx1 is not None else 0,
734
- dresidual_in.stride(0) if dresidual_in is not None else 0,
735
- M,
736
- N,
737
- eps,
738
- dropout_p,
739
- rows_per_program,
740
- is_rms_norm,
741
- BLOCK_N,
742
- dresidual is not None,
743
- dresidual_in is not None,
744
- bias is not None,
745
- dropout_p > 0.0,
746
- )
747
- dw = _dw.sum(0).to(weight.dtype)
748
- db = _db.sum(0).to(bias.dtype) if bias is not None else None
749
- dw1 = _dw1.sum(0).to(weight1.dtype) if weight1 is not None else None
750
- db1 = _db1.sum(0).to(bias1.dtype) if bias1 is not None else None
751
- # Don't need to compute dresidual_in separately in this case
752
- if has_residual and dx.dtype == x.dtype and dropout_p == 0.0 and rowscale is None:
753
- dresidual_in = dx
754
- if has_x1 and dropout_p == 0.0:
755
- dx1 = dx
756
- return (
757
- (dx, dw, db, dresidual_in, dx1, dw1, db1)
758
- if not recompute_output
759
- else (dx, dw, db, dresidual_in, dx1, dw1, db1, y)
760
- )
761
-
762
-
763
- class LayerNormFn(torch.autograd.Function):
764
- @staticmethod
765
- def forward(
766
- ctx,
767
- x,
768
- weight,
769
- bias,
770
- residual=None,
771
- x1=None,
772
- weight1=None,
773
- bias1=None,
774
- eps=1e-6,
775
- dropout_p=0.0,
776
- rowscale=None,
777
- prenorm=False,
778
- residual_in_fp32=False,
779
- is_rms_norm=False,
780
- return_dropout_mask=False,
781
- ):
782
- x_shape_og = x.shape
783
- # reshape input data into 2D tensor
784
- x = x.reshape(-1, x.shape[-1])
785
- if x.stride(-1) != 1:
786
- x = x.contiguous()
787
- if residual is not None:
788
- assert residual.shape == x_shape_og
789
- residual = residual.reshape(-1, residual.shape[-1])
790
- if residual.stride(-1) != 1:
791
- residual = residual.contiguous()
792
- if x1 is not None:
793
- assert x1.shape == x_shape_og
794
- assert rowscale is None, "rowscale is not supported with parallel LayerNorm"
795
- x1 = x1.reshape(-1, x1.shape[-1])
796
- if x1.stride(-1) != 1:
797
- x1 = x1.contiguous()
798
- weight = weight.contiguous()
799
- if bias is not None:
800
- bias = bias.contiguous()
801
- if weight1 is not None:
802
- weight1 = weight1.contiguous()
803
- if bias1 is not None:
804
- bias1 = bias1.contiguous()
805
- if rowscale is not None:
806
- rowscale = rowscale.reshape(-1).contiguous()
807
- residual_dtype = (
808
- residual.dtype
809
- if residual is not None
810
- else (torch.float32 if residual_in_fp32 else None)
811
- )
812
- y, y1, mean, rstd, residual_out, seeds, dropout_mask, dropout_mask1 = (
813
- _layer_norm_fwd(
814
- x,
815
- weight,
816
- bias,
817
- eps,
818
- residual,
819
- x1,
820
- weight1,
821
- bias1,
822
- dropout_p=dropout_p,
823
- rowscale=rowscale,
824
- residual_dtype=residual_dtype,
825
- is_rms_norm=is_rms_norm,
826
- return_dropout_mask=return_dropout_mask,
827
- )
828
- )
829
- ctx.save_for_backward(
830
- residual_out, weight, bias, weight1, bias1, rowscale, seeds, mean, rstd
831
- )
832
- ctx.x_shape_og = x_shape_og
833
- ctx.eps = eps
834
- ctx.dropout_p = dropout_p
835
- ctx.is_rms_norm = is_rms_norm
836
- ctx.has_residual = residual is not None
837
- ctx.has_x1 = x1 is not None
838
- ctx.prenorm = prenorm
839
- ctx.x_dtype = x.dtype
840
- y = y.reshape(x_shape_og)
841
- y1 = y1.reshape(x_shape_og) if y1 is not None else None
842
- residual_out = (
843
- residual_out.reshape(x_shape_og) if residual_out is not None else None
844
- )
845
- dropout_mask = (
846
- dropout_mask.reshape(x_shape_og) if dropout_mask is not None else None
847
- )
848
- dropout_mask1 = (
849
- dropout_mask1.reshape(x_shape_og) if dropout_mask1 is not None else None
850
- )
851
- if not return_dropout_mask:
852
- if weight1 is None:
853
- return y if not prenorm else (y, residual_out)
854
- else:
855
- return (y, y1) if not prenorm else (y, y1, residual_out)
856
- else:
857
- if weight1 is None:
858
- return (
859
- (y, dropout_mask, dropout_mask1)
860
- if not prenorm
861
- else (y, residual_out, dropout_mask, dropout_mask1)
862
- )
863
- else:
864
- return (
865
- (y, y1, dropout_mask, dropout_mask1)
866
- if not prenorm
867
- else (y, y1, residual_out, dropout_mask, dropout_mask1)
868
- )
869
-
870
- @staticmethod
871
- def backward(ctx, dy, *args):
872
- x, weight, bias, weight1, bias1, rowscale, seeds, mean, rstd = ctx.saved_tensors
873
- dy = dy.reshape(-1, dy.shape[-1])
874
- if dy.stride(-1) != 1:
875
- dy = dy.contiguous()
876
- assert dy.shape == x.shape
877
- if weight1 is not None:
878
- dy1, args = args[0], args[1:]
879
- dy1 = dy1.reshape(-1, dy1.shape[-1])
880
- if dy1.stride(-1) != 1:
881
- dy1 = dy1.contiguous()
882
- assert dy1.shape == x.shape
883
- else:
884
- dy1 = None
885
- if ctx.prenorm:
886
- dresidual = args[0]
887
- dresidual = dresidual.reshape(-1, dresidual.shape[-1])
888
- if dresidual.stride(-1) != 1:
889
- dresidual = dresidual.contiguous()
890
- assert dresidual.shape == x.shape
891
- else:
892
- dresidual = None
893
- dx, dw, db, dresidual_in, dx1, dw1, db1 = _layer_norm_bwd(
894
- dy,
895
- x,
896
- weight,
897
- bias,
898
- ctx.eps,
899
- mean,
900
- rstd,
901
- dresidual,
902
- dy1,
903
- weight1,
904
- bias1,
905
- seeds,
906
- ctx.dropout_p,
907
- rowscale,
908
- ctx.has_residual,
909
- ctx.has_x1,
910
- ctx.is_rms_norm,
911
- x_dtype=ctx.x_dtype,
912
- )
913
- return (
914
- dx.reshape(ctx.x_shape_og),
915
- dw,
916
- db,
917
- dresidual_in.reshape(ctx.x_shape_og) if ctx.has_residual else None,
918
- dx1.reshape(ctx.x_shape_og) if dx1 is not None else None,
919
- dw1,
920
- db1,
921
- None,
922
- None,
923
- None,
924
- None,
925
- None,
926
- None,
927
- None,
928
- )
929
-
930
-
931
- def layer_norm_fn(
932
- x,
933
- weight,
934
- bias,
935
- residual=None,
936
- x1=None,
937
- weight1=None,
938
- bias1=None,
939
- eps=1e-6,
940
- dropout_p=0.0,
941
- rowscale=None,
942
- prenorm=False,
943
- residual_in_fp32=False,
944
- is_rms_norm=False,
945
- return_dropout_mask=False,
946
- ):
947
- return LayerNormFn.apply(
948
- x,
949
- weight,
950
- bias,
951
- residual,
952
- x1,
953
- weight1,
954
- bias1,
955
- eps,
956
- dropout_p,
957
- rowscale,
958
- prenorm,
959
- residual_in_fp32,
960
- is_rms_norm,
961
- return_dropout_mask,
962
- )
963
-
964
-
965
- def rms_norm_fn(
966
- x,
967
- weight,
968
- bias,
969
- residual=None,
970
- x1=None,
971
- weight1=None,
972
- bias1=None,
973
- eps=1e-6,
974
- dropout_p=0.0,
975
- rowscale=None,
976
- prenorm=False,
977
- residual_in_fp32=False,
978
- return_dropout_mask=False,
979
- ):
980
- return LayerNormFn.apply(
981
- x,
982
- weight,
983
- bias,
984
- residual,
985
- x1,
986
- weight1,
987
- bias1,
988
- eps,
989
- dropout_p,
990
- rowscale,
991
- prenorm,
992
- residual_in_fp32,
993
- True,
994
- return_dropout_mask,
995
- )
996
-
997
-
998
- class RMSNorm(torch.nn.Module):
999
-
1000
- def __init__(self, hidden_size, eps=1e-5, dropout_p=0.0, device=None, dtype=None):
1001
- factory_kwargs = {"device": device, "dtype": dtype}
1002
- super().__init__()
1003
- self.eps = eps
1004
- if dropout_p > 0.0:
1005
- self.drop = torch.nn.Dropout(dropout_p)
1006
- else:
1007
- self.drop = None
1008
- self.weight = torch.nn.Parameter(torch.empty(hidden_size, **factory_kwargs))
1009
- self.register_parameter("bias", None)
1010
- self.reset_parameters()
1011
-
1012
- def reset_parameters(self):
1013
- torch.nn.init.ones_(self.weight)
1014
-
1015
- def forward(self, x, residual=None, prenorm=False, residual_in_fp32=False):
1016
- return rms_norm_fn(
1017
- x,
1018
- self.weight,
1019
- self.bias,
1020
- residual=residual,
1021
- eps=self.eps,
1022
- dropout_p=self.drop.p if self.drop is not None and self.training else 0.0,
1023
- prenorm=prenorm,
1024
- residual_in_fp32=residual_in_fp32,
1025
- )
1026
-
1027
-
1028
- class LayerNormLinearFn(torch.autograd.Function):
1029
- @staticmethod
1030
- @custom_fwd
1031
- def forward(
1032
- ctx,
1033
- x,
1034
- norm_weight,
1035
- norm_bias,
1036
- linear_weight,
1037
- linear_bias,
1038
- residual=None,
1039
- eps=1e-6,
1040
- prenorm=False,
1041
- residual_in_fp32=False,
1042
- is_rms_norm=False,
1043
- ):
1044
- x_shape_og = x.shape
1045
- # reshape input data into 2D tensor
1046
- x = x.reshape(-1, x.shape[-1])
1047
- if x.stride(-1) != 1:
1048
- x = x.contiguous()
1049
- if residual is not None:
1050
- assert residual.shape == x_shape_og
1051
- residual = residual.reshape(-1, residual.shape[-1])
1052
- if residual.stride(-1) != 1:
1053
- residual = residual.contiguous()
1054
- norm_weight = norm_weight.contiguous()
1055
- if norm_bias is not None:
1056
- norm_bias = norm_bias.contiguous()
1057
- residual_dtype = (
1058
- residual.dtype
1059
- if residual is not None
1060
- else (torch.float32 if residual_in_fp32 else None)
1061
- )
1062
- y, _, mean, rstd, residual_out, *rest = _layer_norm_fwd(
1063
- x,
1064
- norm_weight,
1065
- norm_bias,
1066
- eps,
1067
- residual,
1068
- out_dtype=(
1069
- None
1070
- if not torch.is_autocast_enabled()
1071
- else torch.get_autocast_gpu_dtype()
1072
- ),
1073
- residual_dtype=residual_dtype,
1074
- is_rms_norm=is_rms_norm,
1075
- )
1076
- y = y.reshape(x_shape_og)
1077
- dtype = (
1078
- torch.get_autocast_gpu_dtype() if torch.is_autocast_enabled() else y.dtype
1079
- )
1080
- linear_weight = linear_weight.to(dtype)
1081
- linear_bias = linear_bias.to(dtype) if linear_bias is not None else None
1082
- out = F.linear(y.to(linear_weight.dtype), linear_weight, linear_bias)
1083
- # We don't store y, will be recomputed in the backward pass to save memory
1084
- ctx.save_for_backward(
1085
- residual_out, norm_weight, norm_bias, linear_weight, mean, rstd
1086
- )
1087
- ctx.x_shape_og = x_shape_og
1088
- ctx.eps = eps
1089
- ctx.is_rms_norm = is_rms_norm
1090
- ctx.has_residual = residual is not None
1091
- ctx.prenorm = prenorm
1092
- ctx.x_dtype = x.dtype
1093
- ctx.linear_bias_is_none = linear_bias is None
1094
- return out if not prenorm else (out, residual_out.reshape(x_shape_og))
1095
-
1096
- @staticmethod
1097
- @custom_bwd
1098
- def backward(ctx, dout, *args):
1099
- x, norm_weight, norm_bias, linear_weight, mean, rstd = ctx.saved_tensors
1100
- dout = dout.reshape(-1, dout.shape[-1])
1101
- dy = F.linear(dout, linear_weight.t())
1102
- dlinear_bias = None if ctx.linear_bias_is_none else dout.sum(0)
1103
- if dy.stride(-1) != 1:
1104
- dy = dy.contiguous()
1105
- assert dy.shape == x.shape
1106
- if ctx.prenorm:
1107
- dresidual = args[0]
1108
- dresidual = dresidual.reshape(-1, dresidual.shape[-1])
1109
- if dresidual.stride(-1) != 1:
1110
- dresidual = dresidual.contiguous()
1111
- assert dresidual.shape == x.shape
1112
- else:
1113
- dresidual = None
1114
- dx, dnorm_weight, dnorm_bias, dresidual_in, _, _, _, y = _layer_norm_bwd(
1115
- dy,
1116
- x,
1117
- norm_weight,
1118
- norm_bias,
1119
- ctx.eps,
1120
- mean,
1121
- rstd,
1122
- dresidual=dresidual,
1123
- has_residual=ctx.has_residual,
1124
- is_rms_norm=ctx.is_rms_norm,
1125
- x_dtype=ctx.x_dtype,
1126
- recompute_output=True,
1127
- )
1128
- dlinear_weight = torch.einsum("bo,bi->oi", dout, y)
1129
- return (
1130
- dx.reshape(ctx.x_shape_og),
1131
- dnorm_weight,
1132
- dnorm_bias,
1133
- dlinear_weight,
1134
- dlinear_bias,
1135
- dresidual_in.reshape(ctx.x_shape_og) if ctx.has_residual else None,
1136
- None,
1137
- None,
1138
- None,
1139
- None,
1140
- )
1141
-
1142
-
1143
- def layer_norm_linear_fn(
1144
- x,
1145
- norm_weight,
1146
- norm_bias,
1147
- linear_weight,
1148
- linear_bias,
1149
- residual=None,
1150
- eps=1e-6,
1151
- prenorm=False,
1152
- residual_in_fp32=False,
1153
- is_rms_norm=False,
1154
- ):
1155
- return LayerNormLinearFn.apply(
1156
- x,
1157
- norm_weight,
1158
- norm_bias,
1159
- linear_weight,
1160
- linear_bias,
1161
- residual,
1162
- eps,
1163
- prenorm,
1164
- residual_in_fp32,
1165
- is_rms_norm,
1166
- )
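
The file above is the fused dropout + residual-add + LayerNorm/RMSNorm kernel that this commit removes from the prebuilt tree; its public entry points are layer_norm_fn, rms_norm_fn, the RMSNorm module, and LayerNormLinearFn. A minimal usage sketch follows. It is illustrative only: it assumes a CUDA device and that the same functions remain importable from the upstream mamba_ssm package at the path mirrored here; the tensor sizes are arbitrary.

import torch
# Assumed import path, matching the deleted file's location in the package tree.
from mamba_ssm.ops.triton.layer_norm import layer_norm_fn, layer_norm_ref

batch, seqlen, dim = 2, 128, 1024
x = torch.randn(batch, seqlen, dim, device="cuda", dtype=torch.float16)
residual = torch.randn_like(x)
weight = torch.ones(dim, device="cuda", dtype=torch.float16)
bias = torch.zeros(dim, device="cuda", dtype=torch.float16)

# Fused residual-add + LayerNorm; with prenorm=True the updated residual
# stream (x + residual) is returned alongside the normalized output.
out, new_residual = layer_norm_fn(
    x, weight, bias, residual=residual, eps=1e-6, prenorm=True
)

# The pure-PyTorch reference defined in the same file can sanity-check the kernel.
out_ref, _ = layer_norm_ref(
    x, weight, bias, residual=residual, eps=1e-6, prenorm=True, upcast=True
)
print((out.float() - out_ref.float()).abs().max())

Passing is_rms_norm=True (or calling rms_norm_fn) switches the same kernel to RMSNorm, and a nonzero dropout_p enables the fused dropout described in the header comment of the deleted file.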
 
build/torch25-cxx98-cu118-x86_64-linux/mamba_ssm/ops/triton/selective_state_update.py DELETED
@@ -1,389 +0,0 @@
1
- # Copyright (c) 2024, Tri Dao, Albert Gu.
2
-
3
- """We want triton==2.1.0 or triton==2.2.0 or triton==2.3.0 for this
4
- """
5
-
6
- import math
7
- import torch
8
- import torch.nn.functional as F
9
-
10
- import triton
11
- import triton.language as tl
12
-
13
- from einops import rearrange, repeat
14
-
15
- from .softplus import softplus
16
-
17
-
18
- @triton.heuristics({"HAS_DT_BIAS": lambda args: args["dt_bias_ptr"] is not None})
19
- @triton.heuristics({"HAS_D": lambda args: args["D_ptr"] is not None})
20
- @triton.heuristics({"HAS_Z": lambda args: args["z_ptr"] is not None})
21
- @triton.heuristics(
22
- {
23
- "HAS_STATE_BATCH_INDICES": lambda args: args["state_batch_indices_ptr"]
24
- is not None
25
- }
26
- )
27
- @triton.heuristics(
28
- {"BLOCK_SIZE_DSTATE": lambda args: triton.next_power_of_2(args["dstate"])}
29
- )
30
- @triton.jit
31
- def _selective_scan_update_kernel(
32
- # Pointers to matrices
33
- state_ptr,
34
- x_ptr,
35
- dt_ptr,
36
- dt_bias_ptr,
37
- A_ptr,
38
- B_ptr,
39
- C_ptr,
40
- D_ptr,
41
- z_ptr,
42
- out_ptr,
43
- state_batch_indices_ptr,
44
- # Matrix dimensions
45
- batch,
46
- nheads,
47
- dim,
48
- dstate,
49
- nheads_ngroups_ratio,
50
- # Strides
51
- stride_state_batch,
52
- stride_state_head,
53
- stride_state_dim,
54
- stride_state_dstate,
55
- stride_x_batch,
56
- stride_x_head,
57
- stride_x_dim,
58
- stride_dt_batch,
59
- stride_dt_head,
60
- stride_dt_dim,
61
- stride_dt_bias_head,
62
- stride_dt_bias_dim,
63
- stride_A_head,
64
- stride_A_dim,
65
- stride_A_dstate,
66
- stride_B_batch,
67
- stride_B_group,
68
- stride_B_dstate,
69
- stride_C_batch,
70
- stride_C_group,
71
- stride_C_dstate,
72
- stride_D_head,
73
- stride_D_dim,
74
- stride_z_batch,
75
- stride_z_head,
76
- stride_z_dim,
77
- stride_out_batch,
78
- stride_out_head,
79
- stride_out_dim,
80
- # Meta-parameters
81
- DT_SOFTPLUS: tl.constexpr,
82
- TIE_HDIM: tl.constexpr,
83
- BLOCK_SIZE_M: tl.constexpr,
84
- HAS_DT_BIAS: tl.constexpr,
85
- HAS_D: tl.constexpr,
86
- HAS_Z: tl.constexpr,
87
- HAS_STATE_BATCH_INDICES: tl.constexpr,
88
- BLOCK_SIZE_DSTATE: tl.constexpr,
89
- ):
90
- pid_m = tl.program_id(axis=0)
91
- pid_b = tl.program_id(axis=1)
92
- pid_h = tl.program_id(axis=2)
93
-
94
- if HAS_STATE_BATCH_INDICES:
95
- state_batch_indices_ptr += pid_b
96
- state_batch_idx = tl.load(state_batch_indices_ptr)
97
- state_ptr += state_batch_idx * stride_state_batch + pid_h * stride_state_head
98
- else:
99
- state_ptr += pid_b * stride_state_batch + pid_h * stride_state_head
100
-
101
- x_ptr += pid_b * stride_x_batch + pid_h * stride_x_head
102
- dt_ptr += pid_b * stride_dt_batch + pid_h * stride_dt_head
103
- if HAS_DT_BIAS:
104
- dt_bias_ptr += pid_h * stride_dt_bias_head
105
- A_ptr += pid_h * stride_A_head
106
- B_ptr += pid_b * stride_B_batch + (pid_h // nheads_ngroups_ratio) * stride_B_group
107
- C_ptr += pid_b * stride_C_batch + (pid_h // nheads_ngroups_ratio) * stride_C_group
108
- if HAS_Z:
109
- z_ptr += pid_b * stride_z_batch + pid_h * stride_z_head
110
- out_ptr += pid_b * stride_out_batch + pid_h * stride_out_head
111
-
112
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
113
- offs_n = tl.arange(0, BLOCK_SIZE_DSTATE)
114
- state_ptrs = state_ptr + (
115
- offs_m[:, None] * stride_state_dim + offs_n[None, :] * stride_state_dstate
116
- )
117
- x_ptrs = x_ptr + offs_m * stride_x_dim
118
- dt_ptrs = dt_ptr + offs_m * stride_dt_dim
119
- if HAS_DT_BIAS:
120
- dt_bias_ptrs = dt_bias_ptr + offs_m * stride_dt_bias_dim
121
- if HAS_D:
122
- D_ptr += pid_h * stride_D_head
123
- A_ptrs = A_ptr + (
124
- offs_m[:, None] * stride_A_dim + offs_n[None, :] * stride_A_dstate
125
- )
126
- B_ptrs = B_ptr + offs_n * stride_B_dstate
127
- C_ptrs = C_ptr + offs_n * stride_C_dstate
128
- if HAS_D:
129
- D_ptrs = D_ptr + offs_m * stride_D_dim
130
- if HAS_Z:
131
- z_ptrs = z_ptr + offs_m * stride_z_dim
132
- out_ptrs = out_ptr + offs_m * stride_out_dim
133
-
134
- state = tl.load(
135
- state_ptrs, mask=(offs_m[:, None] < dim) & (offs_n[None, :] < dstate), other=0.0
136
- )
137
- x = tl.load(x_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
138
- if not TIE_HDIM:
139
- dt = tl.load(dt_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
140
- if HAS_DT_BIAS:
141
- dt += tl.load(dt_bias_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
142
- if DT_SOFTPLUS:
143
- dt = tl.where(dt <= 20.0, softplus(dt), dt)
144
- A = tl.load(
145
- A_ptrs, mask=(offs_m[:, None] < dim) & (offs_n[None, :] < dstate), other=0.0
146
- ).to(tl.float32)
147
- dA = tl.exp(A * dt[:, None])
148
- else:
149
- dt = tl.load(dt_ptr).to(tl.float32)
150
- if HAS_DT_BIAS:
151
- dt += tl.load(dt_bias_ptr).to(tl.float32)
152
- if DT_SOFTPLUS:
153
- dt = tl.where(dt <= 20.0, softplus(dt), dt)
154
- A = tl.load(A_ptr).to(tl.float32)
155
- dA = tl.exp(A * dt) # scalar, not a matrix
156
-
157
- B = tl.load(B_ptrs, mask=offs_n < dstate, other=0.0).to(tl.float32)
158
- C = tl.load(C_ptrs, mask=offs_n < dstate, other=0.0).to(tl.float32)
159
- if HAS_D:
160
- D = tl.load(D_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
161
- if HAS_Z:
162
- z = tl.load(z_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
163
-
164
- if not TIE_HDIM:
165
- dB = B[None, :] * dt[:, None]
166
- else:
167
- dB = B * dt # vector of size (dstate,)
168
- state = state * dA + dB * x[:, None]
169
- tl.store(
170
- state_ptrs, state, mask=(offs_m[:, None] < dim) & (offs_n[None, :] < dstate)
171
- )
172
- out = tl.sum(state * C[None, :], axis=1)
173
- if HAS_D:
174
- out += x * D
175
- if HAS_Z:
176
- out *= z * tl.sigmoid(z)
177
- tl.store(out_ptrs, out, mask=offs_m < dim)
178
-
179
-
180
- def selective_state_update(
181
- state,
182
- x,
183
- dt,
184
- A,
185
- B,
186
- C,
187
- D=None,
188
- z=None,
189
- dt_bias=None,
190
- dt_softplus=False,
191
- state_batch_indices=None,
192
- ):
193
- """
194
- Argument:
195
- state: (batch, dim, dstate) or (batch, nheads, dim, dstate)
196
- x: (batch, dim) or (batch, nheads, dim)
197
- dt: (batch, dim) or (batch, nheads, dim)
198
- A: (dim, dstate) or (nheads, dim, dstate)
199
- B: (batch, dstate) or (batch, ngroups, dstate)
200
- C: (batch, dstate) or (batch, ngroups, dstate)
201
- D: (dim,) or (nheads, dim)
202
- z: (batch, dim) or (batch, nheads, dim)
203
- dt_bias: (dim,) or (nheads, dim)
204
- Return:
205
- out: (batch, dim) or (batch, nheads, dim)
206
- """
207
- has_heads = state.dim() > 3
208
- if state.dim() == 3:
209
- state = state.unsqueeze(1)
210
- if x.dim() == 2:
211
- x = x.unsqueeze(1)
212
- if dt.dim() == 2:
213
- dt = dt.unsqueeze(1)
214
- if A.dim() == 2:
215
- A = A.unsqueeze(0)
216
- if B.dim() == 2:
217
- B = B.unsqueeze(1)
218
- if C.dim() == 2:
219
- C = C.unsqueeze(1)
220
- if D is not None and D.dim() == 1:
221
- D = D.unsqueeze(0)
222
- if z is not None and z.dim() == 2:
223
- z = z.unsqueeze(1)
224
- if dt_bias is not None and dt_bias.dim() == 1:
225
- dt_bias = dt_bias.unsqueeze(0)
226
- _, nheads, dim, dstate = state.shape
227
- batch = x.shape[0]
228
- if x.shape != (batch, nheads, dim):
229
- print(f"{state.shape} {x.shape} {batch} {nheads} {dim}")
230
- assert x.shape == (batch, nheads, dim)
231
- assert dt.shape == x.shape
232
- assert A.shape == (nheads, dim, dstate)
233
- ngroups = B.shape[1]
234
- assert nheads % ngroups == 0, "nheads must be divisible by ngroups"
235
- assert B.shape == (batch, ngroups, dstate)
236
- assert C.shape == B.shape
237
- if D is not None:
238
- assert D.shape == (nheads, dim)
239
- if z is not None:
240
- assert z.shape == x.shape
241
- if dt_bias is not None:
242
- assert dt_bias.shape == (nheads, dim)
243
- if state_batch_indices is not None:
244
- assert state_batch_indices.shape == (batch,)
245
- out = torch.empty_like(x)
246
- grid = lambda META: (triton.cdiv(dim, META["BLOCK_SIZE_M"]), batch, nheads)
247
- z_strides = (z.stride(0), z.stride(1), z.stride(2)) if z is not None else (0, 0, 0)
248
- # We don't want autotune since it will overwrite the state
249
- # We instead tune by hand.
250
- BLOCK_SIZE_M, num_warps = (
251
- (32, 4)
252
- if dstate <= 16
253
- else (
254
- (16, 4)
255
- if dstate <= 32
256
- else ((8, 4) if dstate <= 64 else ((4, 4) if dstate <= 128 else ((4, 8))))
257
- )
258
- )
259
- tie_hdim = (
260
- A.stride(-1) == 0
261
- and A.stride(-2) == 0
262
- and dt.stride(-1) == 0
263
- and dt_bias.stride(-1) == 0
264
- )
265
- with torch.cuda.device(x.device.index):
266
- _selective_scan_update_kernel[grid](
267
- state,
268
- x,
269
- dt,
270
- dt_bias,
271
- A,
272
- B,
273
- C,
274
- D,
275
- z,
276
- out,
277
- state_batch_indices,
278
- batch,
279
- nheads,
280
- dim,
281
- dstate,
282
- nheads // ngroups,
283
- state.stride(0),
284
- state.stride(1),
285
- state.stride(2),
286
- state.stride(3),
287
- x.stride(0),
288
- x.stride(1),
289
- x.stride(2),
290
- dt.stride(0),
291
- dt.stride(1),
292
- dt.stride(2),
293
- *(dt_bias.stride(0), dt_bias.stride(1)) if dt_bias is not None else 0,
294
- A.stride(0),
295
- A.stride(1),
296
- A.stride(2),
297
- B.stride(0),
298
- B.stride(1),
299
- B.stride(2),
300
- C.stride(0),
301
- C.stride(1),
302
- C.stride(2),
303
- *(D.stride(0), D.stride(1)) if D is not None else 0,
304
- z_strides[0],
305
- z_strides[1],
306
- z_strides[2],
307
- out.stride(0),
308
- out.stride(1),
309
- out.stride(2),
310
- dt_softplus,
311
- tie_hdim,
312
- BLOCK_SIZE_M,
313
- num_warps=num_warps,
314
- )
315
- if not has_heads:
316
- out = out.squeeze(1)
317
- return out
318
-
319
-
320
- def selective_state_update_ref(
321
- state, x, dt, A, B, C, D=None, z=None, dt_bias=None, dt_softplus=False
322
- ):
323
- """
324
- Argument:
325
- state: (batch, dim, dstate) or (batch, nheads, dim, dstate)
326
- x: (batch, dim) or (batch, nheads, dim)
327
- dt: (batch, dim) or (batch, nheads, dim)
328
- A: (dim, dstate) or (nheads, dim, dstate)
329
- B: (batch, dstate) or (batch, ngroups, dstate)
330
- C: (batch, dstate) or (batch, ngroups, dstate)
331
- D: (dim,) or (nheads, dim)
332
- z: (batch, dim) or (batch, nheads, dim)
333
- dt_bias: (dim,) or (nheads, dim)
334
- Return:
335
- out: (batch, dim) or (batch, nheads, dim)
336
- """
337
- has_heads = state.dim() > 3
338
- if state.dim() == 3:
339
- state = state.unsqueeze(1)
340
- if x.dim() == 2:
341
- x = x.unsqueeze(1)
342
- if dt.dim() == 2:
343
- dt = dt.unsqueeze(1)
344
- if A.dim() == 2:
345
- A = A.unsqueeze(0)
346
- if B.dim() == 2:
347
- B = B.unsqueeze(1)
348
- if C.dim() == 2:
349
- C = C.unsqueeze(1)
350
- if D is not None and D.dim() == 1:
351
- D = D.unsqueeze(0)
352
- if z is not None and z.dim() == 2:
353
- z = z.unsqueeze(1)
354
- if dt_bias is not None and dt_bias.dim() == 1:
355
- dt_bias = dt_bias.unsqueeze(0)
356
- batch, nheads, dim, dstate = state.shape
357
- assert x.shape == (batch, nheads, dim)
358
- assert dt.shape == x.shape
359
- assert A.shape == (nheads, dim, dstate)
360
- ngroups = B.shape[1]
361
- assert nheads % ngroups == 0, "nheads must be divisible by ngroups"
362
- assert B.shape == (batch, ngroups, dstate)
363
- assert C.shape == B.shape
364
- if D is not None:
365
- assert D.shape == (nheads, dim)
366
- if z is not None:
367
- assert z.shape == x.shape
368
- if dt_bias is not None:
369
- assert dt_bias.shape == (nheads, dim)
370
- dt = dt + dt_bias
371
- dt = F.softplus(dt) if dt_softplus else dt
372
- dA = torch.exp(
373
- rearrange(dt, "b h d -> b h d 1") * A
374
- ) # (batch, nheads, dim, dstate)
375
- B = repeat(B, "b g n -> b (g h) n", h=nheads // ngroups) # (batch, nheads, dstate)
376
- C = repeat(C, "b g n -> b (g h) n", h=nheads // ngroups) # (batch, nheads, dstate)
377
- dB = rearrange(dt, "b h d -> b h d 1") * rearrange(
378
- B, "b h n -> b h 1 n"
379
- ) # (batch, nheads, dim, dstate)
380
- state.copy_(
381
- state * dA + dB * rearrange(x, "b h d -> b h d 1")
382
- ) # (batch, dim, dstate
383
- out = torch.einsum("bhdn,bhn->bhd", state.to(C.dtype), C)
384
- if D is not None:
385
- out += (x * D).to(out.dtype)
386
- out = (out if z is None else out * F.silu(z)).to(x.dtype)
387
- if not has_heads:
388
- out = out.squeeze(1)
389
- return out
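
The file above provides the single-timestep SSM update used during incremental decoding: the recurrence state = state * exp(dt * A) + dt * B * x followed by out = C . state (plus D * x and an optional z-gate), fused into one Triton kernel that mutates the state in place. A hedged sketch of one decode step follows; the shapes mirror the docstring above, while the import path and CUDA availability are assumptions.

import torch
# Assumed import path, matching the deleted file's location in the package tree.
from mamba_ssm.ops.triton.selective_state_update import (
    selective_state_update,
    selective_state_update_ref,
)

batch, nheads, dim, dstate, ngroups = 2, 4, 64, 16, 1
state = torch.randn(batch, nheads, dim, dstate, device="cuda")
x = torch.randn(batch, nheads, dim, device="cuda")
dt = torch.rand(batch, nheads, dim, device="cuda")
A = -torch.rand(nheads, dim, dstate, device="cuda")     # negative => decaying state
B = torch.randn(batch, ngroups, dstate, device="cuda")  # groups are shared across heads
C = torch.randn(batch, ngroups, dstate, device="cuda")
D = torch.randn(nheads, dim, device="cuda")
dt_bias = torch.rand(nheads, dim, device="cuda")

state_ref = state.clone()
# Triton kernel: updates `state` in place and returns out of shape (batch, nheads, dim).
out = selective_state_update(state, x, dt, A, B, C, D=D, dt_bias=dt_bias, dt_softplus=True)
# Eager reference from the same file, for a numerical cross-check.
out_ref = selective_state_update_ref(state_ref, x, dt, A, B, C, D=D, dt_bias=dt_bias, dt_softplus=True)
print((out - out_ref).abs().max().item(), (state - state_ref).abs().max().item())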
 
build/torch25-cxx98-cu118-x86_64-linux/mamba_ssm/ops/triton/ssd_chunk_scan.py DELETED
The diff for this file is too large to render. See raw diff
 
build/torch25-cxx98-cu118-x86_64-linux/mamba_ssm/ops/triton/ssd_chunk_state.py DELETED
@@ -1,2012 +0,0 @@
1
- # Copyright (c) 2024, Tri Dao, Albert Gu.
2
-
3
- """We want triton==2.1.0 or 2.2.0 for this
4
- """
5
-
6
- import math
7
- import torch
8
- import torch.nn.functional as F
9
-
10
- import triton
11
- import triton.language as tl
12
-
13
- from einops import rearrange, repeat
14
-
15
- from .softplus import softplus
16
-
17
-
18
- def init_to_zero(names):
19
- return lambda nargs: [
20
- nargs[name].zero_() for name in names if nargs[name] is not None
21
- ]
22
-
23
-
24
- @triton.autotune(
25
- configs=[
26
- triton.Config({"BLOCK_SIZE_H": 1}),
27
- triton.Config({"BLOCK_SIZE_H": 2}),
28
- triton.Config({"BLOCK_SIZE_H": 4}),
29
- triton.Config({"BLOCK_SIZE_H": 8}),
30
- triton.Config({"BLOCK_SIZE_H": 16}),
31
- triton.Config({"BLOCK_SIZE_H": 32}),
32
- triton.Config({"BLOCK_SIZE_H": 64}),
33
- ],
34
- key=["chunk_size", "nheads"],
35
- )
36
- @triton.jit
37
- def _chunk_cumsum_fwd_kernel(
38
- # Pointers to matrices
39
- dt_ptr,
40
- A_ptr,
41
- dt_bias_ptr,
42
- dt_out_ptr,
43
- dA_cumsum_ptr,
44
- # Matrix dimension
45
- batch,
46
- seqlen,
47
- nheads,
48
- chunk_size,
49
- dt_min,
50
- dt_max,
51
- # Strides
52
- stride_dt_batch,
53
- stride_dt_seqlen,
54
- stride_dt_head,
55
- stride_A_head,
56
- stride_dt_bias_head,
57
- stride_dt_out_batch,
58
- stride_dt_out_chunk,
59
- stride_dt_out_head,
60
- stride_dt_out_csize,
61
- stride_dA_cs_batch,
62
- stride_dA_cs_chunk,
63
- stride_dA_cs_head,
64
- stride_dA_cs_csize,
65
- # Meta-parameters
66
- DT_SOFTPLUS: tl.constexpr,
67
- HAS_DT_BIAS: tl.constexpr,
68
- BLOCK_SIZE_H: tl.constexpr,
69
- BLOCK_SIZE_CHUNK: tl.constexpr,
70
- ):
71
- pid_b = tl.program_id(axis=0)
72
- pid_c = tl.program_id(axis=1)
73
- pid_h = tl.program_id(axis=2)
74
- dt_ptr += pid_b * stride_dt_batch + pid_c * chunk_size * stride_dt_seqlen
75
- dt_out_ptr += pid_b * stride_dt_out_batch + pid_c * stride_dt_out_chunk
76
- dA_cumsum_ptr += pid_b * stride_dA_cs_batch + pid_c * stride_dA_cs_chunk
77
-
78
- offs_h = pid_h * BLOCK_SIZE_H + tl.arange(0, BLOCK_SIZE_H)
79
- offs_c = tl.arange(0, BLOCK_SIZE_CHUNK)
80
- dt_ptrs = dt_ptr + (
81
- offs_h[:, None] * stride_dt_head + offs_c[None, :] * stride_dt_seqlen
82
- )
83
- A_ptrs = A_ptr + offs_h * stride_A_head
84
- dt_out_ptrs = dt_out_ptr + (
85
- offs_h[:, None] * stride_dt_out_head + offs_c[None, :] * stride_dt_out_csize
86
- )
87
- dA_cs_ptrs = dA_cumsum_ptr + (
88
- offs_h[:, None] * stride_dA_cs_head + offs_c[None, :] * stride_dA_cs_csize
89
- )
90
- chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
91
-
92
- dt = tl.load(
93
- dt_ptrs,
94
- mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit),
95
- other=0.0,
96
- ).to(tl.float32)
97
- if HAS_DT_BIAS:
98
- dt_bias = tl.load(
99
- dt_bias_ptr + offs_h * stride_dt_bias_head, mask=offs_h < nheads, other=0.0
100
- ).to(tl.float32)
101
- dt += dt_bias[:, None]
102
- if DT_SOFTPLUS:
103
- dt = tl.where(dt <= 20.0, softplus(dt), dt)
104
- # As of Triton 2.2.0, tl.clamp is not available yet
105
- # dt = tl.clamp(dt, dt_min, dt_max)
106
- dt = tl.minimum(tl.maximum(dt, dt_min), dt_max)
107
- dt = tl.where(
108
- (offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit), dt, 0.0
109
- )
110
- tl.store(
111
- dt_out_ptrs,
112
- dt,
113
- mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size),
114
- )
115
- A = tl.load(A_ptrs, mask=offs_h < nheads, other=0.0).to(tl.float32)
116
- dA = dt * A[:, None]
117
- dA_cs = tl.cumsum(dA, axis=1)
118
- tl.store(
119
- dA_cs_ptrs,
120
- dA_cs,
121
- mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size),
122
- )
123
-
124
-
125
- @triton.autotune(
126
- configs=[
127
- triton.Config(
128
- {"BLOCK_SIZE_H": 1}, pre_hook=init_to_zero(["dA_ptr", "ddt_bias_ptr"])
129
- ),
130
- triton.Config(
131
- {"BLOCK_SIZE_H": 2}, pre_hook=init_to_zero(["dA_ptr", "ddt_bias_ptr"])
132
- ),
133
- triton.Config(
134
- {"BLOCK_SIZE_H": 4}, pre_hook=init_to_zero(["dA_ptr", "ddt_bias_ptr"])
135
- ),
136
- triton.Config(
137
- {"BLOCK_SIZE_H": 8}, pre_hook=init_to_zero(["dA_ptr", "ddt_bias_ptr"])
138
- ),
139
- triton.Config(
140
- {"BLOCK_SIZE_H": 16}, pre_hook=init_to_zero(["dA_ptr", "ddt_bias_ptr"])
141
- ),
142
- triton.Config(
143
- {"BLOCK_SIZE_H": 32}, pre_hook=init_to_zero(["dA_ptr", "ddt_bias_ptr"])
144
- ),
145
- triton.Config(
146
- {"BLOCK_SIZE_H": 64}, pre_hook=init_to_zero(["dA_ptr", "ddt_bias_ptr"])
147
- ),
148
- ],
149
- key=["chunk_size", "nheads"],
150
- )
151
- @triton.jit
152
- def _chunk_cumsum_bwd_kernel(
153
- # Pointers to matrices
154
- ddA_ptr,
155
- ddt_out_ptr,
156
- dt_ptr,
157
- A_ptr,
158
- dt_bias_ptr,
159
- ddt_ptr,
160
- dA_ptr,
161
- ddt_bias_ptr,
162
- # Matrix dimensions
163
- batch,
164
- seqlen,
165
- nheads,
166
- chunk_size,
167
- dt_min,
168
- dt_max,
169
- # Strides
170
- stride_ddA_batch,
171
- stride_ddA_chunk,
172
- stride_ddA_head,
173
- stride_ddA_csize,
174
- stride_ddt_out_batch,
175
- stride_ddt_out_chunk,
176
- stride_ddt_out_head,
177
- stride_ddt_out_csize,
178
- stride_dt_batch,
179
- stride_dt_seqlen,
180
- stride_dt_head,
181
- stride_A_head,
182
- stride_dt_bias_head,
183
- stride_ddt_batch,
184
- stride_ddt_seqlen,
185
- stride_ddt_head,
186
- stride_dA_head,
187
- stride_ddt_bias_head,
188
- # Meta-parameters
189
- DT_SOFTPLUS: tl.constexpr,
190
- HAS_DT_BIAS: tl.constexpr,
191
- BLOCK_SIZE_H: tl.constexpr,
192
- BLOCK_SIZE_CHUNK: tl.constexpr,
193
- ):
194
- pid_b = tl.program_id(axis=0)
195
- pid_c = tl.program_id(axis=1)
196
- pid_h = tl.program_id(axis=2)
197
- ddt_out_ptr += pid_b * stride_ddt_out_batch + pid_c * stride_ddt_out_chunk
198
- ddA_ptr += pid_b * stride_ddA_batch + pid_c * stride_ddA_chunk
199
- dt_ptr += pid_b * stride_dt_batch + pid_c * chunk_size * stride_dt_seqlen
200
- ddt_ptr += pid_b * stride_ddt_batch + pid_c * chunk_size * stride_ddt_seqlen
201
-
202
- offs_h = pid_h * BLOCK_SIZE_H + tl.arange(0, BLOCK_SIZE_H)
203
- offs_c = tl.arange(0, BLOCK_SIZE_CHUNK)
204
- ddt_out_ptrs = ddt_out_ptr + (
205
- offs_h[:, None] * stride_ddt_out_head + offs_c[None, :] * stride_ddt_out_csize
206
- )
207
- ddA_ptrs = ddA_ptr + (
208
- offs_h[:, None] * stride_ddA_head + offs_c[None, :] * stride_ddA_csize
209
- )
210
- dt_ptrs = dt_ptr + (
211
- offs_h[:, None] * stride_dt_head + offs_c[None, :] * stride_dt_seqlen
212
- )
213
- ddt_ptrs = ddt_ptr + (
214
- offs_h[:, None] * stride_ddt_head + offs_c[None, :] * stride_ddt_seqlen
215
- )
216
- A_ptrs = A_ptr + offs_h * stride_A_head
217
- chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
218
-
219
- ddA = tl.load(
220
- ddA_ptrs,
221
- mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit),
222
- other=0.0,
223
- ).to(tl.float32)
224
- ddt_out = tl.load(
225
- ddt_out_ptrs,
226
- mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit),
227
- other=0.0,
228
- ).to(tl.float32)
229
- A = tl.load(A_ptrs, mask=offs_h < nheads, other=0.0).to(tl.float32)
230
- ddt = ddA * A[:, None] + ddt_out
231
- dt = tl.load(
232
- dt_ptrs,
233
- mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit),
234
- other=0.0,
235
- ).to(tl.float32)
236
- if HAS_DT_BIAS:
237
- dt_bias = tl.load(
238
- dt_bias_ptr + offs_h * stride_dt_bias_head, mask=offs_h < nheads, other=0.0
239
- ).to(tl.float32)
240
- dt += dt_bias[:, None]
241
- if DT_SOFTPLUS:
242
- dt_presoftplus = dt
243
- dt = tl.where(dt <= 20.0, softplus(dt), ddt)
244
- clamp_mask = (dt < dt_min) | (dt > dt_max)
245
- # As of Triton 2.2.0, tl.clamp is not available yet
246
- # dt = tl.clamp(dt, dt_min, dt_max)
247
- dt = tl.minimum(tl.maximum(dt, dt_min), dt_max)
248
- dt = tl.where(
249
- (offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit), dt, 0.0
250
- )
251
- ddt = tl.where(
252
- (offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit), ddt, 0.0
253
- )
254
- ddt = tl.where(clamp_mask, 0.0, ddt)
255
- if DT_SOFTPLUS:
256
- ddt = tl.where(dt_presoftplus <= 20.0, ddt * tl.sigmoid(dt_presoftplus), ddt)
257
- tl.store(
258
- ddt_ptrs,
259
- ddt,
260
- mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit),
261
- )
262
- dA = tl.sum(ddA * dt, axis=1)
263
- tl.atomic_add(dA_ptr + offs_h * stride_dA_head, dA, mask=offs_h < nheads)
264
- if HAS_DT_BIAS:
265
- ddt_bias = tl.sum(ddt, axis=1)
266
- tl.atomic_add(
267
- ddt_bias_ptr + offs_h * stride_ddt_bias_head, ddt_bias, mask=offs_h < nheads
268
- )
269
-
270
-
271
- @triton.autotune(
272
- configs=[
273
- triton.Config(
274
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64},
275
- num_stages=3,
276
- num_warps=8,
277
- ),
278
- triton.Config(
279
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32},
280
- num_stages=4,
281
- num_warps=4,
282
- ),
283
- triton.Config(
284
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32},
285
- num_stages=4,
286
- num_warps=4,
287
- ),
288
- triton.Config(
289
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
290
- num_stages=4,
291
- num_warps=4,
292
- ),
293
- triton.Config(
294
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32},
295
- num_stages=4,
296
- num_warps=4,
297
- ),
298
- triton.Config(
299
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32},
300
- num_stages=4,
301
- num_warps=4,
302
- ),
303
- triton.Config(
304
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32},
305
- num_stages=5,
306
- num_warps=2,
307
- ),
308
- triton.Config(
309
- {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
310
- num_stages=5,
311
- num_warps=2,
312
- ),
313
- triton.Config(
314
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
315
- num_stages=4,
316
- num_warps=2,
317
- ),
318
- ],
319
- key=["hdim", "dstate", "chunk_size"],
320
- )
321
- @triton.jit
322
- def _chunk_state_fwd_kernel(
323
- # Pointers to matrices
324
- x_ptr,
325
- b_ptr,
326
- states_ptr,
327
- dt_ptr,
328
- dA_cumsum_ptr,
329
- seq_idx_ptr,
330
- # Matrix dimensions
331
- hdim,
332
- dstate,
333
- chunk_size,
334
- batch,
335
- seqlen,
336
- nheads_ngroups_ratio,
337
- # Strides
338
- stride_x_batch,
339
- stride_x_seqlen,
340
- stride_x_head,
341
- stride_x_hdim,
342
- stride_b_batch,
343
- stride_b_seqlen,
344
- stride_b_head,
345
- stride_b_dstate,
346
- stride_states_batch,
347
- stride_states_chunk,
348
- stride_states_head,
349
- stride_states_hdim,
350
- stride_states_dstate,
351
- stride_dt_batch,
352
- stride_dt_chunk,
353
- stride_dt_head,
354
- stride_dt_csize,
355
- stride_dA_cs_batch,
356
- stride_dA_cs_chunk,
357
- stride_dA_cs_head,
358
- stride_dA_cs_csize,
359
- stride_seq_idx_batch,
360
- stride_seq_idx_seqlen,
361
- # Meta-parameters
362
- HAS_SEQ_IDX: tl.constexpr,
363
- BLOCK_SIZE_M: tl.constexpr,
364
- BLOCK_SIZE_N: tl.constexpr,
365
- BLOCK_SIZE_K: tl.constexpr,
366
- ):
367
- pid_bc = tl.program_id(axis=1)
368
- pid_c = pid_bc // batch
369
- pid_b = pid_bc - pid_c * batch
370
- pid_h = tl.program_id(axis=2)
371
- num_pid_n = tl.cdiv(dstate, BLOCK_SIZE_N)
372
- pid_m = tl.program_id(axis=0) // num_pid_n
373
- pid_n = tl.program_id(axis=0) % num_pid_n
374
- b_ptr += (
375
- pid_b * stride_b_batch
376
- + pid_c * chunk_size * stride_b_seqlen
377
- + (pid_h // nheads_ngroups_ratio) * stride_b_head
378
- )
379
- x_ptr += (
380
- pid_b * stride_x_batch
381
- + pid_c * chunk_size * stride_x_seqlen
382
- + pid_h * stride_x_head
383
- )
384
- dt_ptr += pid_b * stride_dt_batch + pid_c * stride_dt_chunk + pid_h * stride_dt_head
385
- dA_cumsum_ptr += (
386
- pid_b * stride_dA_cs_batch
387
- + pid_c * stride_dA_cs_chunk
388
- + pid_h * stride_dA_cs_head
389
- )
390
- if HAS_SEQ_IDX:
391
- seq_idx_ptr += (
392
- pid_b * stride_seq_idx_batch + pid_c * chunk_size * stride_seq_idx_seqlen
393
- )
394
-
395
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
396
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
397
- offs_k = tl.arange(0, BLOCK_SIZE_K)
398
- x_ptrs = x_ptr + (
399
- offs_m[:, None] * stride_x_hdim + offs_k[None, :] * stride_x_seqlen
400
- )
401
- b_ptrs = b_ptr + (
402
- offs_n[None, :] * stride_b_dstate + offs_k[:, None] * stride_b_seqlen
403
- )
404
- dt_ptrs = dt_ptr + offs_k * stride_dt_csize
405
- dA_cs_last = tl.load(dA_cumsum_ptr + (chunk_size - 1) * stride_dA_cs_csize).to(
406
- tl.float32
407
- )
408
- dA_cumsum_ptrs = dA_cumsum_ptr + offs_k * stride_dA_cs_csize
409
- if HAS_SEQ_IDX:
410
- seq_idx_ptrs = seq_idx_ptr + offs_k * stride_seq_idx_seqlen
411
-
412
- chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
413
- if HAS_SEQ_IDX:
414
- seq_idx_last = tl.load(
415
- seq_idx_ptr + (chunk_size_limit - 1) * stride_seq_idx_seqlen
416
- )
417
-
418
- acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
419
- for k in range(0, chunk_size_limit, BLOCK_SIZE_K):
420
- x = tl.load(
421
- x_ptrs,
422
- mask=(offs_m[:, None] < hdim) & (offs_k[None, :] < chunk_size_limit - k),
423
- other=0.0,
424
- )
425
- b = tl.load(
426
- b_ptrs,
427
- mask=(offs_k[:, None] < chunk_size_limit - k) & (offs_n[None, :] < dstate),
428
- other=0.0,
429
- ).to(tl.float32)
430
- dA_cs_k = tl.load(
431
- dA_cumsum_ptrs, mask=offs_k < chunk_size_limit - k, other=0.0
432
- ).to(tl.float32)
433
- if HAS_SEQ_IDX:
434
- seq_idx_k = tl.load(
435
- seq_idx_ptrs, mask=offs_k < chunk_size_limit - k, other=-1
436
- )
437
- dt_k = tl.load(dt_ptrs, mask=offs_k < chunk_size_limit - k, other=0.0).to(
438
- tl.float32
439
- )
440
- if not HAS_SEQ_IDX:
441
- scale = tl.exp((dA_cs_last - dA_cs_k)) * dt_k
442
- else:
443
- scale = tl.where(
444
- seq_idx_k == seq_idx_last, tl.exp((dA_cs_last - dA_cs_k)) * dt_k, 0.0
445
- )
446
- b *= scale[:, None]
447
- b = b.to(x_ptr.dtype.element_ty)
448
- acc += tl.dot(x, b)
449
- x_ptrs += BLOCK_SIZE_K * stride_x_seqlen
450
- b_ptrs += BLOCK_SIZE_K * stride_b_seqlen
451
- dt_ptrs += BLOCK_SIZE_K * stride_dt_csize
452
- dA_cumsum_ptrs += BLOCK_SIZE_K * stride_dA_cs_csize
453
- if HAS_SEQ_IDX:
454
- seq_idx_ptrs += BLOCK_SIZE_K * stride_seq_idx_seqlen
455
- states = acc.to(states_ptr.dtype.element_ty)
456
-
457
- states_ptr += (
458
- pid_b * stride_states_batch
459
- + pid_c * stride_states_chunk
460
- + pid_h * stride_states_head
461
- )
462
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
463
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
464
- states_ptrs = states_ptr + (
465
- offs_m[:, None] * stride_states_hdim + offs_n[None, :] * stride_states_dstate
466
- )
467
- c_mask = (offs_m[:, None] < hdim) & (offs_n[None, :] < dstate)
468
- tl.store(states_ptrs, states, mask=c_mask)
469
-
470
-
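For a single (batch, chunk, head) slice, the accumulation loop in `_chunk_state_fwd_kernel` reduces to the dense PyTorch expression below (a sketch with made-up sizes, ignoring the `seq_idx` masking path; it mirrors the einsum in `chunk_state_ref` further down):

import torch

chunk_size, headdim, dstate = 256, 64, 16
x_c = torch.randn(chunk_size, headdim)                   # this chunk's x rows for one head
b_c = torch.randn(chunk_size, dstate)                    # this chunk's B rows for that head's group
dt_c = torch.rand(chunk_size)                            # per-step dt within the chunk
dA_cs_c = torch.cumsum(-torch.rand(chunk_size), dim=0)   # cumulative sum of dt * A (A < 0)

scale = torch.exp(dA_cs_c[-1] - dA_cs_c) * dt_c          # decay to the chunk end, times dt
state_c = x_c.t() @ (b_c * scale[:, None])               # (headdim, dstate), the kernel's `acc`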
471
- @triton.autotune(
472
- configs=[
473
- triton.Config(
474
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64},
475
- num_stages=3,
476
- num_warps=8,
477
- pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"]),
478
- ),
479
- triton.Config(
480
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32},
481
- num_stages=4,
482
- num_warps=4,
483
- pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"]),
484
- ),
485
- triton.Config(
486
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32},
487
- num_stages=4,
488
- num_warps=4,
489
- pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"]),
490
- ),
491
- triton.Config(
492
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
493
- num_stages=4,
494
- num_warps=4,
495
- pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"]),
496
- ),
497
- triton.Config(
498
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32},
499
- num_stages=4,
500
- num_warps=4,
501
- pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"]),
502
- ),
503
- triton.Config(
504
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32},
505
- num_stages=4,
506
- num_warps=4,
507
- pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"]),
508
- ),
509
- triton.Config(
510
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32},
511
- num_stages=5,
512
- num_warps=4,
513
- pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"]),
514
- ),
515
- triton.Config(
516
- {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
517
- num_stages=5,
518
- num_warps=4,
519
- pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"]),
520
- ),
521
- triton.Config(
522
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
523
- num_stages=4,
524
- num_warps=4,
525
- pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"]),
526
- ),
527
- ],
528
- key=["chunk_size", "hdim", "dstate"],
529
- )
530
- @triton.jit
531
- def _chunk_state_bwd_dx_kernel(
532
- # Pointers to matrices
533
- x_ptr,
534
- b_ptr,
535
- dstates_ptr,
536
- dt_ptr,
537
- dA_cumsum_ptr,
538
- dx_ptr,
539
- ddt_ptr,
540
- ddA_cumsum_ptr,
541
- # Matrix dimensions
542
- chunk_size,
543
- hdim,
544
- dstate,
545
- batch,
546
- seqlen,
547
- nheads_ngroups_ratio,
548
- # Strides
549
- stride_x_batch,
550
- stride_x_seqlen,
551
- stride_x_head,
552
- stride_x_hdim,
553
- stride_b_batch,
554
- stride_b_seqlen,
555
- stride_b_head,
556
- stride_b_dstate,
557
- stride_dstates_batch,
558
- stride_dstates_chunk,
559
- stride_states_head,
560
- stride_states_hdim,
561
- stride_states_dstate,
562
- stride_dt_batch,
563
- stride_dt_chunk,
564
- stride_dt_head,
565
- stride_dt_csize,
566
- stride_dA_cs_batch,
567
- stride_dA_cs_chunk,
568
- stride_dA_cs_head,
569
- stride_dA_cs_csize,
570
- stride_dx_batch,
571
- stride_dx_seqlen,
572
- stride_dx_head,
573
- stride_dx_hdim,
574
- stride_ddt_batch,
575
- stride_ddt_chunk,
576
- stride_ddt_head,
577
- stride_ddt_csize,
578
- stride_ddA_cs_batch,
579
- stride_ddA_cs_chunk,
580
- stride_ddA_cs_head,
581
- stride_ddA_cs_csize,
582
- # Meta-parameters
583
- BLOCK_SIZE_M: tl.constexpr,
584
- BLOCK_SIZE_N: tl.constexpr,
585
- BLOCK_SIZE_K: tl.constexpr,
586
- BLOCK_SIZE_DSTATE: tl.constexpr,
587
- ):
588
- pid_bc = tl.program_id(axis=1)
589
- pid_c = pid_bc // batch
590
- pid_b = pid_bc - pid_c * batch
591
- pid_h = tl.program_id(axis=2)
592
- num_pid_n = tl.cdiv(hdim, BLOCK_SIZE_N)
593
- pid_m = tl.program_id(axis=0) // num_pid_n
594
- pid_n = tl.program_id(axis=0) % num_pid_n
595
- x_ptr += (
596
- pid_b * stride_x_batch
597
- + pid_c * chunk_size * stride_x_seqlen
598
- + pid_h * stride_x_head
599
- )
600
- b_ptr += (
601
- pid_b * stride_b_batch
602
- + pid_c * chunk_size * stride_b_seqlen
603
- + (pid_h // nheads_ngroups_ratio) * stride_b_head
604
- )
605
- dstates_ptr += (
606
- pid_b * stride_dstates_batch
607
- + pid_c * stride_dstates_chunk
608
- + pid_h * stride_states_head
609
- )
610
- dt_ptr += pid_b * stride_dt_batch + pid_c * stride_dt_chunk + pid_h * stride_dt_head
611
- ddt_ptr += (
612
- pid_b * stride_ddt_batch + pid_c * stride_ddt_chunk + pid_h * stride_ddt_head
613
- )
614
- ddA_cumsum_ptr += (
615
- pid_b * stride_ddA_cs_batch
616
- + pid_c * stride_ddA_cs_chunk
617
- + pid_h * stride_ddA_cs_head
618
- )
619
- dA_cumsum_ptr += (
620
- pid_b * stride_dA_cs_batch
621
- + pid_c * stride_dA_cs_chunk
622
- + pid_h * stride_dA_cs_head
623
- )
624
-
625
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
626
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
627
-
628
- chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
629
- # Faster to just do 1 iteration with larger BLOCK_SIZE_K, up to block size 128
630
- offs_k = tl.arange(
631
- 0, BLOCK_SIZE_DSTATE if BLOCK_SIZE_DSTATE <= 128 else BLOCK_SIZE_K
632
- )
633
- b_ptrs = b_ptr + (
634
- offs_m[:, None] * stride_b_seqlen + offs_k[None, :] * stride_b_dstate
635
- )
636
- dstates_ptrs = dstates_ptr + (
637
- offs_n[None, :] * stride_states_hdim + offs_k[:, None] * stride_states_dstate
638
- )
639
- if BLOCK_SIZE_DSTATE <= 128:
640
- b = tl.load(
641
- b_ptrs,
642
- mask=(offs_m[:, None] < chunk_size_limit) & (offs_k[None, :] < dstate),
643
- other=0.0,
644
- )
645
- dstates = tl.load(
646
- dstates_ptrs,
647
- mask=(offs_k[:, None] < dstate) & (offs_n[None, :] < hdim),
648
- other=0.0,
649
- )
650
- dstates = dstates.to(b_ptr.dtype.element_ty)
651
- acc = tl.dot(b, dstates)
652
- else:
653
- acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
654
- for k in range(0, dstate, BLOCK_SIZE_K):
655
- b = tl.load(
656
- b_ptrs,
657
- mask=(offs_m[:, None] < chunk_size_limit)
658
- & (offs_k[None, :] < dstate - k),
659
- other=0.0,
660
- )
661
- dstates = tl.load(
662
- dstates_ptrs,
663
- mask=(offs_k[:, None] < dstate - k) & (offs_n[None, :] < hdim),
664
- other=0.0,
665
- )
666
- dstates = dstates.to(b_ptr.dtype.element_ty)
667
- acc += tl.dot(b, dstates)
668
- b_ptrs += BLOCK_SIZE_K * stride_b_dstate
669
- dstates_ptrs += BLOCK_SIZE_K * stride_states_dstate
670
-
671
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
672
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
673
-
674
- dA_cs_last = tl.load(dA_cumsum_ptr + (chunk_size - 1) * stride_dA_cs_csize).to(
675
- tl.float32
676
- )
677
- dt_ptrs = dt_ptr + offs_m * stride_dt_csize
678
- dA_cumsum_ptrs = dA_cumsum_ptr + offs_m * stride_dA_cs_csize
679
- dA_cs_m = tl.load(dA_cumsum_ptrs, mask=offs_m < chunk_size, other=0.0).to(
680
- tl.float32
681
- )
682
- dt_m = tl.load(dt_ptrs, mask=offs_m < chunk_size, other=0.0).to(tl.float32)
683
- acc *= tl.exp(dA_cs_last - dA_cs_m)[:, None]
684
-
685
- x_ptrs = x_ptr + (
686
- offs_m[:, None] * stride_x_seqlen + offs_n[None, :] * stride_x_hdim
687
- )
688
- x = tl.load(
689
- x_ptrs,
690
- mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim),
691
- other=0.0,
692
- ).to(tl.float32)
693
- ddt = tl.sum(acc * x, axis=1)
694
- ddt_ptrs = ddt_ptr + offs_m * stride_ddt_csize
695
- tl.atomic_add(ddt_ptrs, ddt, mask=offs_m < chunk_size)
696
- ddA_cs = -(ddt * dt_m)
697
- ddA_cs_last = -tl.sum(ddA_cs)
698
- ddA_cumsum_ptrs = ddA_cumsum_ptr + offs_m * stride_ddA_cs_csize
699
- tl.atomic_add(ddA_cumsum_ptrs, ddA_cs, mask=offs_m < chunk_size)
700
- tl.atomic_add(ddA_cumsum_ptr + (chunk_size - 1) * stride_ddA_cs_csize, ddA_cs_last)
701
-
702
- dx = (acc * dt_m[:, None]).to(dx_ptr.dtype.element_ty)
703
- dx_ptr += (
704
- pid_b * stride_dx_batch
705
- + pid_c * chunk_size * stride_dx_seqlen
706
- + pid_h * stride_dx_head
707
- )
708
- dx_ptrs = dx_ptr + (
709
- offs_m[:, None] * stride_dx_seqlen + offs_n[None, :] * stride_dx_hdim
710
- )
711
- tl.store(
712
- dx_ptrs,
713
- dx,
714
- mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim),
715
- )
716
-
717
-
718
- @triton.autotune(
719
- configs=[
720
- triton.Config(
721
- {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128},
722
- num_stages=3,
723
- num_warps=4,
724
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
725
- ),
726
- triton.Config(
727
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32},
728
- num_stages=3,
729
- num_warps=4,
730
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
731
- ),
732
- triton.Config(
733
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128},
734
- num_stages=3,
735
- num_warps=4,
736
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
737
- ),
738
- triton.Config(
739
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64},
740
- num_stages=3,
741
- num_warps=4,
742
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
743
- ),
744
- triton.Config(
745
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64},
746
- num_stages=3,
747
- num_warps=4,
748
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
749
- ),
750
- triton.Config(
751
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32},
752
- num_stages=3,
753
- num_warps=4,
754
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
755
- ),
756
- triton.Config(
757
- {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64},
758
- num_stages=3,
759
- num_warps=4,
760
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
761
- ),
762
- triton.Config(
763
- {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32},
764
- num_stages=3,
765
- num_warps=4,
766
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
767
- ),
768
- ],
769
- key=["chunk_size", "dstate", "hdim"],
770
- )
771
- @triton.jit
772
- def _chunk_state_bwd_db_kernel(
773
- # Pointers to matrices
774
- x_ptr,
775
- dstates_ptr,
776
- b_ptr,
777
- dt_ptr,
778
- dA_cumsum_ptr,
779
- seq_idx_ptr,
780
- db_ptr,
781
- ddA_cumsum_ptr,
782
- # Matrix dimensions
783
- chunk_size,
784
- dstate,
785
- hdim,
786
- batch,
787
- seqlen,
788
- nheads,
789
- nheads_per_program,
790
- ngroups,
791
- # Strides
792
- stride_x_batch,
793
- stride_x_seqlen,
794
- stride_x_head,
795
- stride_x_hdim,
796
- stride_dstates_batch,
797
- stride_dstates_chunk,
798
- stride_states_head,
799
- stride_states_hdim,
800
- stride_states_dstate,
801
- stride_b_batch,
802
- stride_b_seqlen,
803
- stride_b_head,
804
- stride_b_dstate,
805
- stride_dt_batch,
806
- stride_dt_chunk,
807
- stride_dt_head,
808
- stride_dt_csize,
809
- stride_dA_cs_batch,
810
- stride_dA_cs_chunk,
811
- stride_dA_cs_head,
812
- stride_dA_cs_csize,
813
- stride_seq_idx_batch,
814
- stride_seq_idx_seqlen,
815
- stride_db_batch,
816
- stride_db_seqlen,
817
- stride_db_split,
818
- stride_db_group,
819
- stride_db_dstate,
820
- stride_ddA_cs_batch,
821
- stride_ddA_cs_chunk,
822
- stride_ddA_cs_head,
823
- stride_ddA_cs_csize,
824
- # Meta-parameters
825
- HAS_DDA_CS: tl.constexpr,
826
- HAS_SEQ_IDX: tl.constexpr,
827
- BLOCK_SIZE_M: tl.constexpr,
828
- BLOCK_SIZE_N: tl.constexpr,
829
- BLOCK_SIZE_K: tl.constexpr,
830
- ):
831
- pid_bc = tl.program_id(axis=1)
832
- pid_c = pid_bc // batch
833
- pid_b = pid_bc - pid_c * batch
834
- pid_sg = tl.program_id(axis=2)
835
- pid_s = pid_sg // ngroups
836
- pid_g = pid_sg - pid_s * ngroups
837
- num_pid_n = tl.cdiv(dstate, BLOCK_SIZE_N)
838
- pid_m = tl.program_id(axis=0) // num_pid_n
839
- pid_n = tl.program_id(axis=0) % num_pid_n
840
- x_ptr += (
841
- pid_b * stride_x_batch
842
- + pid_c * chunk_size * stride_x_seqlen
843
- + (pid_g * (nheads // ngroups) + pid_s * nheads_per_program) * stride_x_head
844
- )
845
- db_ptr += (
846
- pid_b * stride_db_batch
847
- + pid_c * chunk_size * stride_db_seqlen
848
- + pid_g * stride_db_group
849
- + pid_s * stride_db_split
850
- )
851
- dstates_ptr += (
852
- pid_b * stride_dstates_batch
853
- + pid_c * stride_dstates_chunk
854
- + (pid_g * (nheads // ngroups) + pid_s * nheads_per_program)
855
- * stride_states_head
856
- )
857
- dt_ptr += (
858
- pid_b * stride_dt_batch
859
- + pid_c * stride_dt_chunk
860
- + (pid_g * (nheads // ngroups) + pid_s * nheads_per_program) * stride_dt_head
861
- )
862
- dA_cumsum_ptr += (
863
- pid_b * stride_dA_cs_batch
864
- + pid_c * stride_dA_cs_chunk
865
- + (pid_g * (nheads // ngroups) + pid_s * nheads_per_program) * stride_dA_cs_head
866
- )
867
- if HAS_DDA_CS:
868
- b_ptr += (
869
- pid_b * stride_b_batch
870
- + pid_c * chunk_size * stride_b_seqlen
871
- + pid_g * stride_b_head
872
- )
873
- ddA_cumsum_ptr += (
874
- pid_b * stride_ddA_cs_batch
875
- + pid_c * stride_ddA_cs_chunk
876
- + (pid_g * (nheads // ngroups) + pid_s * nheads_per_program)
877
- * stride_ddA_cs_head
878
- )
879
- if HAS_SEQ_IDX:
880
- seq_idx_ptr += (
881
- pid_b * stride_seq_idx_batch + pid_c * chunk_size * stride_seq_idx_seqlen
882
- )
883
-
884
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
885
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
886
- offs_k = tl.arange(0, BLOCK_SIZE_K)
887
- x_ptrs = x_ptr + (
888
- offs_m[:, None] * stride_x_seqlen + offs_k[None, :] * stride_x_hdim
889
- )
890
- dstates_ptrs = dstates_ptr + (
891
- offs_n[None, :] * stride_states_dstate + offs_k[:, None] * stride_states_hdim
892
- )
893
- dt_ptrs = dt_ptr + offs_m * stride_dt_csize
894
- dA_cumsum_ptrs = dA_cumsum_ptr + offs_m * stride_dA_cs_csize
895
- if HAS_DDA_CS:
896
- b_ptrs = b_ptr + (
897
- offs_m[:, None] * stride_b_seqlen + offs_n[None, :] * stride_b_dstate
898
- )
899
- ddA_cumsum_ptrs = ddA_cumsum_ptr + offs_m * stride_ddA_cs_csize
900
-
901
- chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
902
- acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
903
- if HAS_DDA_CS:
904
- b = tl.load(
905
- b_ptrs,
906
- mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < dstate),
907
- other=0.0,
908
- ).to(tl.float32)
909
- if HAS_SEQ_IDX:
910
- seq_idx_m = tl.load(
911
- seq_idx_ptr + offs_m * stride_seq_idx_seqlen,
912
- mask=offs_m < chunk_size_limit,
913
- other=-1,
914
- )
915
- seq_idx_last = tl.load(
916
- seq_idx_ptr + (chunk_size_limit - 1) * stride_seq_idx_seqlen
917
- )
918
- nheads_iter = min(
919
- nheads_per_program, nheads // ngroups - pid_s * nheads_per_program
920
- )
921
- for h in range(nheads_iter):
922
- x = tl.load(
923
- x_ptrs,
924
- mask=(offs_m[:, None] < chunk_size_limit) & (offs_k[None, :] < hdim),
925
- other=0.0,
926
- )
927
- dstates = tl.load(
928
- dstates_ptrs,
929
- mask=(offs_k[:, None] < hdim) & (offs_n[None, :] < dstate),
930
- other=0.0,
931
- )
932
- dstates = dstates.to(x_ptrs.dtype.element_ty)
933
- db = tl.dot(x, dstates)
934
- dA_cs_last = tl.load(dA_cumsum_ptr + (chunk_size - 1) * stride_dA_cs_csize).to(
935
- tl.float32
936
- )
937
- dA_cs_m = tl.load(dA_cumsum_ptrs, mask=offs_m < chunk_size, other=0.0).to(
938
- tl.float32
939
- )
940
- dt_m = tl.load(dt_ptrs, mask=offs_m < chunk_size, other=0.0).to(tl.float32)
941
- if not HAS_SEQ_IDX:
942
- scale = tl.exp(dA_cs_last - dA_cs_m)
943
- else:
944
- scale = tl.where(
945
- seq_idx_m == seq_idx_last, tl.exp(dA_cs_last - dA_cs_m), 0.0
946
- )
947
- db *= (scale * dt_m)[:, None]
948
- if HAS_DDA_CS:
949
- # This is the gradient wrt (dA_cs_last - dA_cs_m), i.e. the exclusive reverse cumsum
950
- ddA_cs = tl.sum(db * b, axis=1)
951
- tl.atomic_add(
952
- ddA_cumsum_ptrs + stride_ddA_cs_csize,
953
- ddA_cs,
954
- mask=offs_m < chunk_size - 1,
955
- )
956
- acc += db
957
- x_ptrs += stride_x_head
958
- dstates_ptrs += stride_states_head
959
- dt_ptrs += stride_dt_head
960
- dA_cumsum_ptr += stride_dA_cs_head
961
- dA_cumsum_ptrs += stride_dA_cs_head
962
- if HAS_DDA_CS:
963
- ddA_cumsum_ptrs += stride_ddA_cs_head
964
-
965
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
966
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
967
- # if HAS_SEQ_IDX:
968
- # seq_idx_last = tl.load(seq_idx_ptr + (chunk_size_limit - 1) * stride_seq_idx_seqlen)
969
- # seq_idx_m = tl.load(seq_idx_ptr + offs_m * stride_seq_idx_seqlen, mask=offs_m < chunk_size_limit, other=-1)
970
- # acc = tl.where(seq_idx_m[:, None] == seq_idx_last, acc, 0.0)
971
- db_ptrs = db_ptr + (
972
- offs_m[:, None] * stride_db_seqlen + offs_n[None, :] * stride_db_dstate
973
- )
974
- tl.store(
975
- db_ptrs,
976
- acc,
977
- mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < dstate),
978
- )
979
-
980
-
981
- @triton.autotune(
982
- configs=[
983
- # triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64}, num_stages=3, num_warps=8, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
984
- # triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
985
- # triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
986
- # triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
987
- # triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
988
- # triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
989
- # triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32}, num_stages=5, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
990
- # triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=5, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
991
- # triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
992
- triton.Config(
993
- {"BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 32},
994
- num_stages=3,
995
- num_warps=4,
996
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
997
- ),
998
- triton.Config(
999
- {"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32},
1000
- num_stages=3,
1001
- num_warps=4,
1002
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
1003
- ),
1004
- triton.Config(
1005
- {"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
1006
- num_stages=3,
1007
- num_warps=4,
1008
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
1009
- ),
1010
- triton.Config(
1011
- {"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32},
1012
- num_stages=3,
1013
- num_warps=4,
1014
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
1015
- ),
1016
- triton.Config(
1017
- {"BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 32},
1018
- num_stages=4,
1019
- num_warps=8,
1020
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
1021
- ),
1022
- triton.Config(
1023
- {"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32},
1024
- num_stages=4,
1025
- num_warps=8,
1026
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
1027
- ),
1028
- triton.Config(
1029
- {"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
1030
- num_stages=4,
1031
- num_warps=8,
1032
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
1033
- ),
1034
- triton.Config(
1035
- {"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32},
1036
- num_stages=4,
1037
- num_warps=8,
1038
- pre_hook=init_to_zero(["ddA_cumsum_ptr"]),
1039
- ),
1040
- ],
1041
- key=["chunk_size", "hdim", "dstate"],
1042
- )
1043
- @triton.jit
1044
- def _chunk_state_bwd_ddAcs_stable_kernel(
1045
- # Pointers to matrices
1046
- x_ptr,
1047
- b_ptr,
1048
- dstates_ptr,
1049
- dt_ptr,
1050
- dA_cumsum_ptr,
1051
- seq_idx_ptr,
1052
- ddA_cumsum_ptr,
1053
- # Matrix dimensions
1054
- chunk_size,
1055
- hdim,
1056
- dstate,
1057
- batch,
1058
- seqlen,
1059
- nheads_ngroups_ratio,
1060
- # Strides
1061
- stride_x_batch,
1062
- stride_x_seqlen,
1063
- stride_x_head,
1064
- stride_x_hdim,
1065
- stride_b_batch,
1066
- stride_b_seqlen,
1067
- stride_b_head,
1068
- stride_b_dstate,
1069
- stride_dstates_batch,
1070
- stride_dstates_chunk,
1071
- stride_states_head,
1072
- stride_states_hdim,
1073
- stride_states_dstate,
1074
- stride_dt_batch,
1075
- stride_dt_chunk,
1076
- stride_dt_head,
1077
- stride_dt_csize,
1078
- stride_dA_cs_batch,
1079
- stride_dA_cs_chunk,
1080
- stride_dA_cs_head,
1081
- stride_dA_cs_csize,
1082
- stride_seq_idx_batch,
1083
- stride_seq_idx_seqlen,
1084
- stride_ddA_cs_batch,
1085
- stride_ddA_cs_chunk,
1086
- stride_ddA_cs_head,
1087
- stride_ddA_cs_csize,
1088
- # Meta-parameters
1089
- HAS_SEQ_IDX: tl.constexpr,
1090
- BLOCK_SIZE_M: tl.constexpr,
1091
- BLOCK_SIZE_N: tl.constexpr,
1092
- BLOCK_SIZE_K: tl.constexpr,
1093
- BLOCK_SIZE_DSTATE: tl.constexpr,
1094
- ):
1095
- pid_bc = tl.program_id(axis=1)
1096
- pid_c = pid_bc // batch
1097
- pid_b = pid_bc - pid_c * batch
1098
- pid_h = tl.program_id(axis=2)
1099
- num_pid_n = tl.cdiv(hdim, BLOCK_SIZE_N)
1100
- pid_m = tl.program_id(axis=0) // num_pid_n
1101
- pid_n = tl.program_id(axis=0) % num_pid_n
1102
- x_ptr += (
1103
- pid_b * stride_x_batch
1104
- + pid_c * chunk_size * stride_x_seqlen
1105
- + pid_h * stride_x_head
1106
- )
1107
- b_ptr += (
1108
- pid_b * stride_b_batch
1109
- + pid_c * chunk_size * stride_b_seqlen
1110
- + (pid_h // nheads_ngroups_ratio) * stride_b_head
1111
- )
1112
- dstates_ptr += (
1113
- pid_b * stride_dstates_batch
1114
- + pid_c * stride_dstates_chunk
1115
- + pid_h * stride_states_head
1116
- )
1117
- dt_ptr += pid_b * stride_dt_batch + pid_c * stride_dt_chunk + pid_h * stride_dt_head
1118
- ddA_cumsum_ptr += (
1119
- pid_b * stride_ddA_cs_batch
1120
- + pid_c * stride_ddA_cs_chunk
1121
- + pid_h * stride_ddA_cs_head
1122
- )
1123
- dA_cumsum_ptr += (
1124
- pid_b * stride_dA_cs_batch
1125
- + pid_c * stride_dA_cs_chunk
1126
- + pid_h * stride_dA_cs_head
1127
- )
1128
- if HAS_SEQ_IDX:
1129
- seq_idx_ptr += (
1130
- pid_b * stride_seq_idx_batch + pid_c * chunk_size * stride_seq_idx_seqlen
1131
- )
1132
-
1133
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
1134
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
1135
-
1136
- chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
1137
- # Faster to just do 1 iteration with larger BLOCK_SIZE_K, up to block size 128
1138
- offs_k = tl.arange(
1139
- 0, BLOCK_SIZE_DSTATE if BLOCK_SIZE_DSTATE <= 128 else BLOCK_SIZE_K
1140
- )
1141
- b_ptrs = b_ptr + (
1142
- offs_m[:, None] * stride_b_seqlen + offs_k[None, :] * stride_b_dstate
1143
- )
1144
- dstates_ptrs = dstates_ptr + (
1145
- offs_n[None, :] * stride_states_hdim + offs_k[:, None] * stride_states_dstate
1146
- )
1147
- if BLOCK_SIZE_DSTATE <= 128:
1148
- b = tl.load(
1149
- b_ptrs,
1150
- mask=(offs_m[:, None] < chunk_size_limit) & (offs_k[None, :] < dstate),
1151
- other=0.0,
1152
- )
1153
- dstates = tl.load(
1154
- dstates_ptrs,
1155
- mask=(offs_k[:, None] < dstate) & (offs_n[None, :] < hdim),
1156
- other=0.0,
1157
- )
1158
- dstates = dstates.to(b_ptr.dtype.element_ty)
1159
- acc = tl.dot(b, dstates)
1160
- else:
1161
- acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
1162
- for k in range(0, dstate, BLOCK_SIZE_K):
1163
- b = tl.load(
1164
- b_ptrs,
1165
- mask=(offs_m[:, None] < chunk_size_limit)
1166
- & (offs_k[None, :] < dstate - k),
1167
- other=0.0,
1168
- )
1169
- dstates = tl.load(
1170
- dstates_ptrs,
1171
- mask=(offs_k[:, None] < dstate - k) & (offs_n[None, :] < hdim),
1172
- other=0.0,
1173
- )
1174
- dstates = dstates.to(b_ptr.dtype.element_ty)
1175
- acc += tl.dot(b, dstates)
1176
- b_ptrs += BLOCK_SIZE_K * stride_b_dstate
1177
- dstates_ptrs += BLOCK_SIZE_K * stride_states_dstate
1178
-
1179
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
1180
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
1181
-
1182
- dA_cs_m = tl.load(
1183
- dA_cumsum_ptr + offs_m * stride_dA_cs_csize, mask=offs_m < chunk_size, other=0.0
1184
- ).to(tl.float32)
1185
- dA_cs_last = tl.load(dA_cumsum_ptr + (chunk_size - 1) * stride_dA_cs_csize).to(
1186
- tl.float32
1187
- )
1188
- if not HAS_SEQ_IDX:
1189
- scale = tl.exp(dA_cs_last - dA_cs_m)
1190
- else:
1191
- seq_idx_m = tl.load(
1192
- seq_idx_ptr + offs_m * stride_seq_idx_seqlen,
1193
- mask=offs_m < chunk_size_limit,
1194
- other=-1,
1195
- )
1196
- seq_idx_last = tl.load(
1197
- seq_idx_ptr + (chunk_size_limit - 1) * stride_seq_idx_seqlen
1198
- )
1199
- scale = tl.where(seq_idx_m == seq_idx_last, tl.exp(dA_cs_last - dA_cs_m), 0.0)
1200
- acc *= scale[:, None]
1201
-
1202
- x_ptrs = x_ptr + (
1203
- offs_m[:, None] * stride_x_seqlen + offs_n[None, :] * stride_x_hdim
1204
- )
1205
- x = tl.load(
1206
- x_ptrs,
1207
- mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim),
1208
- other=0.0,
1209
- ).to(tl.float32)
1210
- dt_ptrs = dt_ptr + offs_m * stride_dt_csize
1211
- dt_m = tl.load(dt_ptrs, mask=offs_m < chunk_size, other=0.0).to(tl.float32)
1212
- ddt = tl.sum(acc * x, axis=1)
1213
- # ddA_cs = -(ddt * dt_m)
1214
- # Triton 2.2.0 errors if we have the cumsum here, so we just write it out
1215
- # then call torch.cumsum outside this kernel.
1216
- # ddA_cs = tl.cumsum(ddt * dt_m)
1217
- ddA_cs = ddt * dt_m
1218
- ddA_cumsum_ptrs = ddA_cumsum_ptr + offs_m * stride_ddA_cs_csize
1219
- # tl.atomic_add(ddA_cumsum_ptrs, ddA_cs, mask=offs_m < chunk_size)
1220
- tl.atomic_add(
1221
- ddA_cumsum_ptrs + stride_ddA_cs_csize, ddA_cs, mask=offs_m < chunk_size - 1
1222
- )
1223
-
1224
-
1225
- @triton.autotune(
1226
- configs=[
1227
- triton.Config(
1228
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64},
1229
- num_stages=3,
1230
- num_warps=8,
1231
- ),
1232
- triton.Config(
1233
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32},
1234
- num_stages=4,
1235
- num_warps=4,
1236
- ),
1237
- triton.Config(
1238
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32},
1239
- num_stages=4,
1240
- num_warps=4,
1241
- ),
1242
- triton.Config(
1243
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
1244
- num_stages=4,
1245
- num_warps=4,
1246
- ),
1247
- triton.Config(
1248
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32},
1249
- num_stages=4,
1250
- num_warps=4,
1251
- ),
1252
- triton.Config(
1253
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32},
1254
- num_stages=4,
1255
- num_warps=4,
1256
- ),
1257
- triton.Config(
1258
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32},
1259
- num_stages=5,
1260
- num_warps=2,
1261
- ),
1262
- triton.Config(
1263
- {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
1264
- num_stages=5,
1265
- num_warps=2,
1266
- ),
1267
- triton.Config(
1268
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
1269
- num_stages=4,
1270
- num_warps=2,
1271
- ),
1272
- ],
1273
- key=["hdim", "dstate", "chunk_size"],
1274
- )
1275
- @triton.jit
1276
- def _chunk_state_varlen_kernel(
1277
- # Pointers to matrices
1278
- x_ptr,
1279
- b_ptr,
1280
- dt_ptr,
1281
- dA_cumsum_ptr,
1282
- chunk_states_ptr,
1283
- cu_seqlens_ptr,
1284
- states_ptr,
1285
- # Matrix dimensions
1286
- hdim,
1287
- dstate,
1288
- chunk_size,
1289
- seqlen,
1290
- nheads_ngroups_ratio,
1291
- # Strides
1292
- stride_x_seqlen,
1293
- stride_x_head,
1294
- stride_x_hdim,
1295
- stride_b_seqlen,
1296
- stride_b_head,
1297
- stride_b_dstate,
1298
- stride_dt_chunk,
1299
- stride_dt_head,
1300
- stride_dt_csize,
1301
- stride_dA_cs_chunk,
1302
- stride_dA_cs_head,
1303
- stride_dA_cs_csize,
1304
- stride_chunk_states_chunk,
1305
- stride_chunk_states_head,
1306
- stride_chunk_states_hdim,
1307
- stride_chunk_states_dstate,
1308
- stride_states_batch,
1309
- stride_states_head,
1310
- stride_states_hdim,
1311
- stride_states_dstate,
1312
- # Meta-parameters
1313
- BLOCK_SIZE_M: tl.constexpr,
1314
- BLOCK_SIZE_N: tl.constexpr,
1315
- BLOCK_SIZE_K: tl.constexpr,
1316
- ):
1317
- pid_b = tl.program_id(axis=1)
1318
- pid_h = tl.program_id(axis=2)
1319
- num_pid_n = tl.cdiv(dstate, BLOCK_SIZE_N)
1320
- pid_m = tl.program_id(axis=0) // num_pid_n
1321
- pid_n = tl.program_id(axis=0) % num_pid_n
1322
- end_idx = tl.load(cu_seqlens_ptr + pid_b + 1)
1323
- pid_c = (end_idx - 1) // chunk_size
1324
- b_ptr += (
1325
- pid_c * chunk_size * stride_b_seqlen
1326
- + (pid_h // nheads_ngroups_ratio) * stride_b_head
1327
- )
1328
- x_ptr += pid_c * chunk_size * stride_x_seqlen + pid_h * stride_x_head
1329
- dt_ptr += pid_c * stride_dt_chunk + pid_h * stride_dt_head
1330
- dA_cumsum_ptr += pid_c * stride_dA_cs_chunk + pid_h * stride_dA_cs_head
1331
- chunk_states_ptr += (
1332
- pid_c * stride_chunk_states_chunk + pid_h * stride_chunk_states_head
1333
- )
1334
-
1335
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
1336
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
1337
- offs_k = tl.arange(0, BLOCK_SIZE_K)
1338
- x_ptrs = x_ptr + (
1339
- offs_m[:, None] * stride_x_hdim + offs_k[None, :] * stride_x_seqlen
1340
- )
1341
- b_ptrs = b_ptr + (
1342
- offs_n[None, :] * stride_b_dstate + offs_k[:, None] * stride_b_seqlen
1343
- )
1344
- dt_ptrs = dt_ptr + offs_k * stride_dt_csize
1345
- dA_cs_last = tl.load(
1346
- dA_cumsum_ptr + (end_idx - pid_c * chunk_size - 1) * stride_dA_cs_csize
1347
- ).to(tl.float32)
1348
- dA_cumsum_ptrs = dA_cumsum_ptr + offs_k * stride_dA_cs_csize
1349
-
1350
- chunk_size_limit = end_idx - pid_c * chunk_size
1351
- start_idx = tl.load(cu_seqlens_ptr + pid_b)
1352
- start_idx_cur = tl.maximum(start_idx - pid_c * chunk_size, 0)
1353
-
1354
- acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
1355
- for k in range(0, chunk_size_limit, BLOCK_SIZE_K):
1356
- x = tl.load(
1357
- x_ptrs,
1358
- mask=(offs_m[:, None] < hdim)
1359
- & (offs_k[None, :] < chunk_size_limit - k)
1360
- & (offs_k[None, :] >= start_idx_cur - k),
1361
- other=0.0,
1362
- )
1363
- b = tl.load(
1364
- b_ptrs,
1365
- mask=(offs_k[:, None] < chunk_size_limit - k)
1366
- & (offs_n[None, :] < dstate)
1367
- & (offs_k[:, None] >= start_idx_cur - k),
1368
- other=0.0,
1369
- ).to(tl.float32)
1370
- dA_cs_k = tl.load(
1371
- dA_cumsum_ptrs, mask=offs_k < chunk_size_limit - k, other=0.0
1372
- ).to(tl.float32)
1373
- dt_k = tl.load(dt_ptrs, mask=offs_k < chunk_size_limit - k, other=0.0).to(
1374
- tl.float32
1375
- )
1376
- scale = tl.where(
1377
- (offs_k >= start_idx_cur - k) & (offs_k < chunk_size_limit - k),
1378
- tl.exp((dA_cs_last - dA_cs_k)) * dt_k,
1379
- 0.0,
1380
- )
1381
- b *= scale[:, None]
1382
- b = b.to(x_ptr.dtype.element_ty)
1383
- acc += tl.dot(x, b)
1384
- x_ptrs += BLOCK_SIZE_K * stride_x_seqlen
1385
- b_ptrs += BLOCK_SIZE_K * stride_b_seqlen
1386
- dt_ptrs += BLOCK_SIZE_K * stride_dt_csize
1387
- dA_cumsum_ptrs += BLOCK_SIZE_K * stride_dA_cs_csize
1388
-
1389
- # If the sequence starts after the last chunk idx, we don't need to add the contribution from the last chunk
1390
- if start_idx < pid_c * chunk_size:
1391
- chunk_states_ptrs = chunk_states_ptr + (
1392
- offs_m[:, None] * stride_chunk_states_hdim
1393
- + offs_n[None, :] * stride_chunk_states_dstate
1394
- )
1395
- chunk_states = tl.load(
1396
- chunk_states_ptrs,
1397
- mask=(offs_m[:, None] < hdim) & (offs_n[None, :] < dstate),
1398
- other=0.0,
1399
- ).to(tl.float32)
1400
- # scale = tl.where(start_idx < pid_c * chunk_size, tl.exp(dA_cs_last), 0.0)
1401
- scale = tl.exp(dA_cs_last)
1402
- acc += chunk_states * scale
1403
-
1404
- states = acc.to(states_ptr.dtype.element_ty)
1405
-
1406
- states_ptr += pid_b * stride_states_batch + pid_h * stride_states_head
1407
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
1408
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
1409
- states_ptrs = states_ptr + (
1410
- offs_m[:, None] * stride_states_hdim + offs_n[None, :] * stride_states_dstate
1411
- )
1412
- c_mask = (offs_m[:, None] < hdim) & (offs_n[None, :] < dstate)
1413
- tl.store(states_ptrs, states, mask=c_mask)
1414
-
1415
-
1416
- def _chunk_cumsum_fwd(
1417
- dt, A, chunk_size, dt_bias=None, dt_softplus=False, dt_limit=(0.0, float("inf"))
1418
- ):
1419
- batch, seqlen, nheads = dt.shape
1420
- assert A.shape == (nheads,)
1421
- if dt_bias is not None:
1422
- assert dt_bias.shape == (nheads,)
1423
- nchunks = math.ceil(seqlen / chunk_size)
1424
- dt_out = torch.empty(
1425
- batch, nheads, nchunks, chunk_size, device=dt.device, dtype=torch.float32
1426
- )
1427
- dA_cumsum = torch.empty(
1428
- batch, nheads, nchunks, chunk_size, device=dt.device, dtype=torch.float32
1429
- )
1430
- grid_chunk_cs = lambda META: (
1431
- batch,
1432
- nchunks,
1433
- triton.cdiv(nheads, META["BLOCK_SIZE_H"]),
1434
- )
1435
- with torch.cuda.device(dt.device.index):
1436
- _chunk_cumsum_fwd_kernel[grid_chunk_cs](
1437
- dt,
1438
- A,
1439
- dt_bias,
1440
- dt_out,
1441
- dA_cumsum,
1442
- batch,
1443
- seqlen,
1444
- nheads,
1445
- chunk_size,
1446
- dt_limit[0],
1447
- dt_limit[1],
1448
- dt.stride(0),
1449
- dt.stride(1),
1450
- dt.stride(2),
1451
- A.stride(0),
1452
- dt_bias.stride(0) if dt_bias is not None else 0,
1453
- dt_out.stride(0),
1454
- dt_out.stride(2),
1455
- dt_out.stride(1),
1456
- dt_out.stride(3),
1457
- dA_cumsum.stride(0),
1458
- dA_cumsum.stride(2),
1459
- dA_cumsum.stride(1),
1460
- dA_cumsum.stride(3),
1461
- dt_softplus,
1462
- HAS_DT_BIAS=dt_bias is not None,
1463
- BLOCK_SIZE_CHUNK=triton.next_power_of_2(chunk_size),
1464
- )
1465
- return dA_cumsum, dt_out
1466
-
1467
-
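As a rough PyTorch reference for what `_chunk_cumsum_fwd` returns (two float32 tensors of shape (batch, nheads, nchunks, chunk_size)), inferred from the wrapper arguments and the backward kernel above; `chunk_cumsum_fwd_ref` is a hypothetical helper name:

import math
import torch
import torch.nn.functional as F

def chunk_cumsum_fwd_ref(dt, A, chunk_size, dt_bias=None, dt_softplus=False,
                         dt_limit=(0.0, float("inf"))):
    # dt: (batch, seqlen, nheads), A: (nheads,)
    batch, seqlen, nheads = dt.shape
    nchunks = math.ceil(seqlen / chunk_size)
    dt = dt.float()
    if dt_bias is not None:
        dt = dt + dt_bias.float()
    if dt_softplus:
        dt = F.softplus(dt)   # identity for inputs > 20, like the kernel's softplus guard
    dt = dt.clamp(min=dt_limit[0], max=dt_limit[1])
    # pad to a whole number of chunks, then lay out as (batch, nheads, nchunks, chunk_size)
    dt = F.pad(dt, (0, 0, 0, nchunks * chunk_size - seqlen))
    dt_out = dt.reshape(batch, nchunks, chunk_size, nheads).permute(0, 3, 1, 2).contiguous()
    dA_cumsum = torch.cumsum(dt_out * A.float()[None, :, None, None], dim=-1)
    return dA_cumsum, dt_out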
1468
- def _chunk_cumsum_bwd(
1469
- ddA,
1470
- ddt_out,
1471
- dt,
1472
- A,
1473
- dt_bias=None,
1474
- dt_softplus=False,
1475
- dt_limit=(0.0, float("inf")),
1476
- ddt=None,
1477
- ):
1478
- batch, seqlen, nheads = dt.shape
1479
- _, _, nchunks, chunk_size = ddA.shape
1480
- assert ddA.shape == (batch, nheads, nchunks, chunk_size)
1481
- assert ddt_out.shape == (batch, nheads, nchunks, chunk_size)
1482
- assert A.shape == (nheads,)
1483
- if dt_bias is not None:
1484
- assert dt_bias.shape == (nheads,)
1485
- ddt_bias = torch.empty_like(dt_bias, dtype=torch.float32)
1486
- else:
1487
- ddt_bias = None
1488
- if ddt is not None:
1489
- assert ddt.shape == dt.shape
1490
- else:
1491
- ddt = torch.empty_like(dt)
1492
- dA = torch.empty_like(A, dtype=torch.float32)
1493
- grid_chunk_cs = lambda META: (
1494
- batch,
1495
- nchunks,
1496
- triton.cdiv(nheads, META["BLOCK_SIZE_H"]),
1497
- )
1498
- with torch.cuda.device(dt.device.index):
1499
- _chunk_cumsum_bwd_kernel[grid_chunk_cs](
1500
- ddA,
1501
- ddt_out,
1502
- dt,
1503
- A,
1504
- dt_bias,
1505
- ddt,
1506
- dA,
1507
- ddt_bias,
1508
- batch,
1509
- seqlen,
1510
- nheads,
1511
- chunk_size,
1512
- dt_limit[0],
1513
- dt_limit[1],
1514
- ddA.stride(0),
1515
- ddA.stride(2),
1516
- ddA.stride(1),
1517
- ddA.stride(3),
1518
- ddt_out.stride(0),
1519
- ddt_out.stride(2),
1520
- ddt_out.stride(1),
1521
- ddt_out.stride(3),
1522
- dt.stride(0),
1523
- dt.stride(1),
1524
- dt.stride(2),
1525
- A.stride(0),
1526
- dt_bias.stride(0) if dt_bias is not None else 0,
1527
- ddt.stride(0),
1528
- ddt.stride(1),
1529
- ddt.stride(2),
1530
- dA.stride(0),
1531
- ddt_bias.stride(0) if ddt_bias is not None else 0,
1532
- dt_softplus,
1533
- HAS_DT_BIAS=dt_bias is not None,
1534
- BLOCK_SIZE_CHUNK=triton.next_power_of_2(chunk_size),
1535
- )
1536
- return ddt, dA, ddt_bias
1537
-
1538
-
1539
- def _chunk_state_fwd(
1540
- B, x, dt, dA_cumsum, seq_idx=None, states=None, states_in_fp32=True
1541
- ):
1542
- batch, seqlen, nheads, headdim = x.shape
1543
- _, _, nchunks, chunk_size = dt.shape
1544
- _, _, ngroups, dstate = B.shape
1545
- assert nheads % ngroups == 0
1546
- assert B.shape == (batch, seqlen, ngroups, dstate)
1547
- assert dt.shape == (batch, nheads, nchunks, chunk_size)
1548
- assert dA_cumsum.shape == dt.shape
1549
- if seq_idx is not None:
1550
- assert seq_idx.shape == (batch, seqlen)
1551
- if states is not None:
1552
- assert states.shape == (batch, nchunks, nheads, headdim, dstate)
1553
- else:
1554
- states_dtype = torch.float32 if states_in_fp32 else B.dtype
1555
- states = torch.empty(
1556
- (batch, nchunks, nheads, headdim, dstate),
1557
- device=x.device,
1558
- dtype=states_dtype,
1559
- )
1560
- grid = lambda META: (
1561
- triton.cdiv(headdim, META["BLOCK_SIZE_M"])
1562
- * triton.cdiv(dstate, META["BLOCK_SIZE_N"]),
1563
- batch * nchunks,
1564
- nheads,
1565
- )
1566
- with torch.cuda.device(x.device.index):
1567
- _chunk_state_fwd_kernel[grid](
1568
- x,
1569
- B,
1570
- states,
1571
- dt,
1572
- dA_cumsum,
1573
- seq_idx,
1574
- headdim,
1575
- dstate,
1576
- chunk_size,
1577
- batch,
1578
- seqlen,
1579
- nheads // ngroups,
1580
- x.stride(0),
1581
- x.stride(1),
1582
- x.stride(2),
1583
- x.stride(3),
1584
- B.stride(0),
1585
- B.stride(1),
1586
- B.stride(2),
1587
- B.stride(-1),
1588
- states.stride(0),
1589
- states.stride(1),
1590
- states.stride(2),
1591
- states.stride(3),
1592
- states.stride(4),
1593
- dt.stride(0),
1594
- dt.stride(2),
1595
- dt.stride(1),
1596
- dt.stride(3),
1597
- dA_cumsum.stride(0),
1598
- dA_cumsum.stride(2),
1599
- dA_cumsum.stride(1),
1600
- dA_cumsum.stride(3),
1601
- *(
1602
- (seq_idx.stride(0), seq_idx.stride(1))
1603
- if seq_idx is not None
1604
- else (0, 0)
1605
- ),
1606
- HAS_SEQ_IDX=seq_idx is not None,
1607
- )
1608
- return states
1609
-
1610
-
1611
- def _chunk_state_bwd_dx(B, x, dt, dA_cumsum, dstates, dx=None):
1612
- batch, seqlen, nheads, headdim = x.shape
1613
- _, _, nchunks, chunk_size = dt.shape
1614
- _, _, ngroups, dstate = B.shape
1615
- assert nheads % ngroups == 0
1616
- assert B.shape == (batch, seqlen, ngroups, dstate)
1617
- assert dt.shape == (batch, nheads, nchunks, chunk_size)
1618
- assert dA_cumsum.shape == dt.shape
1619
- assert dstates.shape == (batch, nchunks, nheads, headdim, dstate)
1620
- if dx is not None:
1621
- assert dx.shape == x.shape
1622
- else:
1623
- dx = torch.empty_like(x)
1624
- ddt = torch.empty(
1625
- batch, nheads, nchunks, chunk_size, device=dt.device, dtype=torch.float32
1626
- )
1627
- ddA_cumsum = torch.empty(
1628
- batch, nheads, nchunks, chunk_size, device=dA_cumsum.device, dtype=torch.float32
1629
- )
1630
- grid_dx = lambda META: (
1631
- triton.cdiv(chunk_size, META["BLOCK_SIZE_M"])
1632
- * triton.cdiv(headdim, META["BLOCK_SIZE_N"]),
1633
- batch * nchunks,
1634
- nheads,
1635
- )
1636
- with torch.cuda.device(x.device.index):
1637
- _chunk_state_bwd_dx_kernel[grid_dx](
1638
- x,
1639
- B,
1640
- dstates,
1641
- dt,
1642
- dA_cumsum,
1643
- dx,
1644
- ddt,
1645
- ddA_cumsum,
1646
- chunk_size,
1647
- headdim,
1648
- dstate,
1649
- batch,
1650
- seqlen,
1651
- nheads // ngroups,
1652
- x.stride(0),
1653
- x.stride(1),
1654
- x.stride(2),
1655
- x.stride(3),
1656
- B.stride(0),
1657
- B.stride(1),
1658
- B.stride(2),
1659
- B.stride(-1),
1660
- dstates.stride(0),
1661
- dstates.stride(1),
1662
- dstates.stride(2),
1663
- dstates.stride(3),
1664
- dstates.stride(4),
1665
- dt.stride(0),
1666
- dt.stride(2),
1667
- dt.stride(1),
1668
- dt.stride(3),
1669
- dA_cumsum.stride(0),
1670
- dA_cumsum.stride(2),
1671
- dA_cumsum.stride(1),
1672
- dA_cumsum.stride(3),
1673
- dx.stride(0),
1674
- dx.stride(1),
1675
- dx.stride(2),
1676
- dx.stride(3),
1677
- ddt.stride(0),
1678
- ddt.stride(2),
1679
- ddt.stride(1),
1680
- ddt.stride(3),
1681
- ddA_cumsum.stride(0),
1682
- ddA_cumsum.stride(2),
1683
- ddA_cumsum.stride(1),
1684
- ddA_cumsum.stride(3),
1685
- BLOCK_SIZE_DSTATE=max(triton.next_power_of_2(dstate), 16),
1686
- )
1687
- return dx, ddt.to(dt.dtype), ddA_cumsum.to(dA_cumsum.dtype)
1688
-
1689
-
1690
- def _chunk_state_bwd_db(x, dt, dA_cumsum, dstates, seq_idx=None, B=None, ngroups=1):
1691
- batch, seqlen, nheads, headdim = x.shape
1692
- _, _, nchunks, chunk_size = dt.shape
1693
- dstate = dstates.shape[-1]
1694
- assert dt.shape == (batch, nheads, nchunks, chunk_size)
1695
- assert dA_cumsum.shape == dt.shape
1696
- assert dstates.shape == (batch, nchunks, nheads, headdim, dstate)
1697
- if seq_idx is not None:
1698
- assert seq_idx.shape == (batch, seqlen)
1699
- if B is not None:
1700
- assert B.shape == (batch, seqlen, ngroups, dstate)
1701
- B_strides = (B.stride(0), B.stride(1), B.stride(2), B.stride(3))
1702
- # Use torch.empty since the Triton kernel will call init_to_zero
1703
- ddA_cumsum = torch.empty(
1704
- batch, nheads, nchunks, chunk_size, device=x.device, dtype=torch.float32
1705
- )
1706
- ddA_cumsum_strides = (
1707
- ddA_cumsum.stride(0),
1708
- ddA_cumsum.stride(2),
1709
- ddA_cumsum.stride(1),
1710
- ddA_cumsum.stride(3),
1711
- )
1712
- else:
1713
- B_strides = (0, 0, 0, 0)
1714
- ddA_cumsum = None
1715
- ddA_cumsum_strides = (0, 0, 0, 0)
1716
- nheads_ngroups_ratio = nheads // ngroups
1717
- sm_count = torch.cuda.get_device_properties(x.device).multi_processor_count
1718
- nheads_per_program = max(
1719
- min(math.ceil(batch * nchunks * nheads / sm_count), nheads_ngroups_ratio), 1
1720
- )
1721
- nsplits = triton.cdiv(nheads_ngroups_ratio, nheads_per_program)
1722
- dB = torch.empty(
1723
- batch, seqlen, nsplits, ngroups, dstate, device=x.device, dtype=torch.float32
1724
- )
1725
- grid_db = lambda META: (
1726
- triton.cdiv(chunk_size, META["BLOCK_SIZE_M"])
1727
- * triton.cdiv(dstate, META["BLOCK_SIZE_N"]),
1728
- batch * nchunks,
1729
- nsplits * ngroups,
1730
- )
1731
- with torch.cuda.device(x.device.index):
1732
- _chunk_state_bwd_db_kernel[grid_db](
1733
- x,
1734
- dstates,
1735
- B,
1736
- dt,
1737
- dA_cumsum,
1738
- seq_idx,
1739
- dB,
1740
- ddA_cumsum,
1741
- chunk_size,
1742
- dstate,
1743
- headdim,
1744
- batch,
1745
- seqlen,
1746
- nheads,
1747
- nheads_per_program,
1748
- ngroups,
1749
- x.stride(0),
1750
- x.stride(1),
1751
- x.stride(2),
1752
- x.stride(3),
1753
- dstates.stride(0),
1754
- dstates.stride(1),
1755
- dstates.stride(2),
1756
- dstates.stride(3),
1757
- dstates.stride(4),
1758
- *B_strides,
1759
- dt.stride(0),
1760
- dt.stride(2),
1761
- dt.stride(1),
1762
- dt.stride(3),
1763
- dA_cumsum.stride(0),
1764
- dA_cumsum.stride(2),
1765
- dA_cumsum.stride(1),
1766
- dA_cumsum.stride(3),
1767
- *(
1768
- (seq_idx.stride(0), seq_idx.stride(1))
1769
- if seq_idx is not None
1770
- else (0, 0)
1771
- ),
1772
- dB.stride(0),
1773
- dB.stride(1),
1774
- dB.stride(2),
1775
- dB.stride(3),
1776
- dB.stride(4),
1777
- *ddA_cumsum_strides,
1778
- HAS_DDA_CS=ddA_cumsum is not None,
1779
- HAS_SEQ_IDX=seq_idx is not None,
1780
- BLOCK_SIZE_K=max(triton.next_power_of_2(headdim), 16),
1781
- )
1782
- dB = dB.sum(2)
1783
- if ddA_cumsum is not None:
1784
- # The first element of ddA_cumsum is always zero, since that dA_cumsum does not contribute
1785
- # to the state of the chunk.
1786
- # torch.cumsum(ddA_cumsum[..., 1:], dim=-1, out=ddA_cumsum[..., 1:])
1787
- # But it's easier to just do the cumsum for all elements, the result will be the same.
1788
- torch.cumsum(ddA_cumsum, dim=-1, out=ddA_cumsum)
1789
- return dB if B is None else (dB, ddA_cumsum)
1790
-
1791
-
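A worked example of the `nheads_per_program` / `nsplits` arithmetic above, with made-up sizes: the launch tries to keep every SM busy while never letting one program cross a B-group boundary.

import math
import triton

batch, nchunks, nheads, ngroups = 2, 8, 64, 8
sm_count = 108                              # e.g. an A100
nheads_ngroups_ratio = nheads // ngroups    # 8 heads share each group's B
nheads_per_program = max(
    min(math.ceil(batch * nchunks * nheads / sm_count), nheads_ngroups_ratio), 1
)                                           # ceil(1024 / 108) = 10, capped at 8
nsplits = triton.cdiv(nheads_ngroups_ratio, nheads_per_program)   # -> 1
# dB is produced as (batch, seqlen, nsplits, ngroups, dstate) and reduced with dB.sum(2).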
1792
- def _chunk_state_bwd_ddAcs_stable(B, x, dt, dA_cumsum, dstates, seq_idx=None):
1793
- batch, seqlen, nheads, headdim = x.shape
1794
- _, _, nchunks, chunk_size = dt.shape
1795
- _, _, ngroups, dstate = B.shape
1796
- assert nheads % ngroups == 0
1797
- assert B.shape == (batch, seqlen, ngroups, dstate)
1798
- assert dt.shape == (batch, nheads, nchunks, chunk_size)
1799
- assert dA_cumsum.shape == dt.shape
1800
- assert dstates.shape == (batch, nchunks, nheads, headdim, dstate)
1801
- if seq_idx is not None:
1802
- assert seq_idx.shape == (batch, seqlen)
1803
- # Use torch.empty since the Triton kernel will call init_to_zero
1804
- ddA_cumsum = torch.empty(
1805
- batch, nheads, nchunks, chunk_size, device=x.device, dtype=torch.float32
1806
- )
1807
- grid_ddtcs = lambda META: (
1808
- triton.cdiv(chunk_size, META["BLOCK_SIZE_M"])
1809
- * triton.cdiv(headdim, META["BLOCK_SIZE_N"]),
1810
- batch * nchunks,
1811
- nheads,
1812
- )
1813
- with torch.cuda.device(x.device.index):
1814
- _chunk_state_bwd_ddAcs_stable_kernel[grid_ddtcs](
1815
- x,
1816
- B,
1817
- dstates,
1818
- dt,
1819
- dA_cumsum,
1820
- seq_idx,
1821
- ddA_cumsum,
1822
- chunk_size,
1823
- headdim,
1824
- dstate,
1825
- batch,
1826
- seqlen,
1827
- nheads // ngroups,
1828
- x.stride(0),
1829
- x.stride(1),
1830
- x.stride(2),
1831
- x.stride(3),
1832
- B.stride(0),
1833
- B.stride(1),
1834
- B.stride(2),
1835
- B.stride(-1),
1836
- dstates.stride(0),
1837
- dstates.stride(1),
1838
- dstates.stride(2),
1839
- dstates.stride(3),
1840
- dstates.stride(4),
1841
- dt.stride(0),
1842
- dt.stride(2),
1843
- dt.stride(1),
1844
- dt.stride(3),
1845
- dA_cumsum.stride(0),
1846
- dA_cumsum.stride(2),
1847
- dA_cumsum.stride(1),
1848
- dA_cumsum.stride(3),
1849
- *(
1850
- (seq_idx.stride(0), seq_idx.stride(1))
1851
- if seq_idx is not None
1852
- else (0, 0)
1853
- ),
1854
- ddA_cumsum.stride(0),
1855
- ddA_cumsum.stride(2),
1856
- ddA_cumsum.stride(1),
1857
- ddA_cumsum.stride(3),
1858
- HAS_SEQ_IDX=seq_idx is not None,
1859
- BLOCK_SIZE_M=max(triton.next_power_of_2(chunk_size), 16),
1860
- BLOCK_SIZE_DSTATE=max(triton.next_power_of_2(dstate), 16),
1861
- )
1862
- torch.cumsum(ddA_cumsum[..., 1:], dim=-1, out=ddA_cumsum[..., 1:])
1863
- return ddA_cumsum
1864
-
1865
-
1866
- def chunk_state_varlen(B, x, dt, dA_cumsum, cu_seqlens, chunk_states):
1867
- total_seqlen, nheads, headdim = x.shape
1868
- _, nchunks, chunk_size = dt.shape
1869
- _, ngroups, dstate = B.shape
1870
- batch = cu_seqlens.shape[0] - 1
1871
- cu_seqlens = cu_seqlens.contiguous()
1872
- assert nheads % ngroups == 0
1873
- assert B.shape == (total_seqlen, ngroups, dstate)
1874
- assert dt.shape == (nheads, nchunks, chunk_size)
1875
- assert dA_cumsum.shape == dt.shape
1876
- assert chunk_states.shape == (nchunks, nheads, headdim, dstate)
1877
- states = torch.empty(
1878
- batch,
1879
- nheads,
1880
- headdim,
1881
- dstate,
1882
- dtype=chunk_states.dtype,
1883
- device=chunk_states.device,
1884
- )
1885
- grid = lambda META: (
1886
- triton.cdiv(headdim, META["BLOCK_SIZE_M"])
1887
- * triton.cdiv(dstate, META["BLOCK_SIZE_N"]),
1888
- batch,
1889
- nheads,
1890
- )
1891
- with torch.cuda.device(x.device.index):
1892
- _chunk_state_varlen_kernel[grid](
1893
- x,
1894
- B,
1895
- dt,
1896
- dA_cumsum,
1897
- chunk_states,
1898
- cu_seqlens,
1899
- states,
1900
- headdim,
1901
- dstate,
1902
- chunk_size,
1903
- total_seqlen,
1904
- nheads // ngroups,
1905
- x.stride(0),
1906
- x.stride(1),
1907
- x.stride(2),
1908
- B.stride(0),
1909
- B.stride(1),
1910
- B.stride(2),
1911
- dt.stride(1),
1912
- dt.stride(0),
1913
- dt.stride(2),
1914
- dA_cumsum.stride(1),
1915
- dA_cumsum.stride(0),
1916
- dA_cumsum.stride(2),
1917
- chunk_states.stride(0),
1918
- chunk_states.stride(1),
1919
- chunk_states.stride(2),
1920
- chunk_states.stride(3),
1921
- states.stride(0),
1922
- states.stride(1),
1923
- states.stride(2),
1924
- states.stride(3),
1925
- )
1926
- return states
1927
-
1928
-
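A usage sketch for `chunk_state_varlen` with two packed sequences (all sizes made up); it recomputes the final SSM state of each sequence in the packed batch from the per-chunk quantities:

import torch

total_seqlen, nheads, headdim, dstate, chunk_size = 16, 4, 64, 16, 8
ngroups = 1
nchunks = (total_seqlen + chunk_size - 1) // chunk_size
device = "cuda"

x = torch.randn(total_seqlen, nheads, headdim, device=device, dtype=torch.float16)
B = torch.randn(total_seqlen, ngroups, dstate, device=device, dtype=torch.float16)
dt = torch.rand(nheads, nchunks, chunk_size, device=device)
dA_cumsum = torch.cumsum(-torch.rand(nheads, nchunks, chunk_size, device=device), dim=-1)
chunk_states = torch.randn(nchunks, nheads, headdim, dstate, device=device, dtype=torch.float16)
cu_seqlens = torch.tensor([0, 5, 16], device=device, dtype=torch.int32)   # lengths 5 and 11

states = chunk_state_varlen(B, x, dt, dA_cumsum, cu_seqlens, chunk_states)
# states: (batch=2, nheads, headdim, dstate), one final state per packed sequence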
1929
- class ChunkStateFn(torch.autograd.Function):
1930
-
1931
- @staticmethod
1932
- def forward(ctx, B, x, dt, dA_cumsum, states_in_fp32=True):
1933
- batch, seqlen, nheads, headdim = x.shape
1934
- _, _, nchunks, chunk_size = dt.shape
1935
- assert seqlen <= nchunks * chunk_size
1936
- _, _, ngroups, dstate = B.shape
1937
- assert B.shape == (batch, seqlen, ngroups, dstate)
1938
- assert dt.shape == (batch, nheads, nchunks, chunk_size)
1939
- assert dA_cumsum.shape == (batch, nheads, nchunks, chunk_size)
1940
- if B.stride(-1) != 1:
1941
- B = B.contiguous()
1942
- if (
1943
- x.stride(-1) != 1 and x.stride(1) != 1
1944
- ): # Either M or K dimension should be contiguous
1945
- x = x.contiguous()
1946
- states = _chunk_state_fwd(B, x, dt, dA_cumsum, states_in_fp32=states_in_fp32)
1947
- ctx.save_for_backward(B, x, dt, dA_cumsum)
1948
- return states
1949
-
1950
- @staticmethod
1951
- def backward(ctx, dstates):
1952
- B, x, dt, dA_cumsum = ctx.saved_tensors
1953
- batch, seqlen, nheads, headdim = x.shape
1954
- _, _, nchunks, chunk_size = dt.shape
1955
- _, _, ngroups, dstate = B.shape
1956
- assert dstates.shape == (batch, nchunks, nheads, headdim, dstate)
1957
- if dstates.stride(-1) != 1:
1958
- dstates = dstates.contiguous()
1959
- dx, ddt, ddA_cumsum = _chunk_state_bwd_dx(B, x, dt, dA_cumsum, dstates)
1960
- dB = _chunk_state_bwd_db(x, dt, dA_cumsum, dstates, ngroups=ngroups)
1961
- dB = dB.to(B.dtype)
1962
- return dB, dx, ddt, ddA_cumsum, None
1963
-
1964
-
1965
- def chunk_state(B, x, dt, dA_cumsum, states_in_fp32=True):
1966
- """
1967
- Argument:
1968
- B: (batch, seqlen, ngroups, dstate)
1969
- x: (batch, seqlen, nheads, headdim)
1970
- dt: (batch, nheads, nchunks, chunk_size)
1971
- dA_cumsum: (batch, nheads, nchunks, chunk_size)
1972
- Return:
1973
- states: (batch, nchunks, nheads, headdim, dstate)
1974
- """
1975
- return ChunkStateFn.apply(B, x, dt, dA_cumsum, states_in_fp32)
1976
-
1977
-
1978
- def chunk_state_ref(B, x, dt, dA_cumsum):
1979
- """
1980
- Argument:
1981
- B: (batch, seqlen, ngroups, dstate)
1982
- x: (batch, seqlen, nheads, headdim)
1983
- dt: (batch, nheads, nchunks, chunk_size)
1984
- dA_cumsum: (batch, nheads, nchunks, chunk_size)
1985
- Return:
1986
- states: (batch, nchunks, nheads, headdim, dstate)
1987
- """
1988
- # Check constraints.
1989
- batch, seqlen, nheads, headdim = x.shape
1990
- dstate = B.shape[-1]
1991
- _, _, nchunks, chunk_size = dt.shape
1992
- assert seqlen <= nchunks * chunk_size
1993
- assert x.shape == (batch, seqlen, nheads, headdim)
1994
- assert dt.shape == (batch, nheads, nchunks, chunk_size)
1995
- ngroups = B.shape[2]
1996
- assert nheads % ngroups == 0
1997
- assert B.shape == (batch, seqlen, ngroups, dstate)
1998
- B = repeat(B, "b l g d -> b l (g h) d", h=nheads // ngroups)
1999
- assert dA_cumsum.shape == (batch, nheads, nchunks, chunk_size)
2000
- if seqlen < nchunks * chunk_size:
2001
- x = F.pad(x, (0, 0, 0, 0, 0, nchunks * chunk_size - seqlen))
2002
- B = F.pad(B, (0, 0, 0, 0, 0, nchunks * chunk_size - seqlen))
2003
- x = rearrange(x, "b (c l) h p -> b c l h p", l=chunk_size)
2004
- B = rearrange(B, "b (c l) ... -> b c l ...", l=chunk_size)
2005
- decay_states = torch.exp((dA_cumsum[:, :, :, -1:] - dA_cumsum))
2006
- return torch.einsum(
2007
- "bclhn,bhcl,bhcl,bclhp->bchpn",
2008
- B.to(x.dtype),
2009
- decay_states.to(x.dtype),
2010
- dt.to(x.dtype),
2011
- x,
2012
- )
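A quick numerical check of the Triton path against the `chunk_state_ref` einsum above, with made-up sizes (needs a CUDA device):

import torch

batch, seqlen, nheads, headdim = 2, 512, 8, 64
ngroups, dstate, chunk_size = 1, 16, 128
nchunks = seqlen // chunk_size
device = "cuda"

x = torch.randn(batch, seqlen, nheads, headdim, device=device, dtype=torch.bfloat16)
B = torch.randn(batch, seqlen, ngroups, dstate, device=device, dtype=torch.bfloat16)
dt = torch.rand(batch, nheads, nchunks, chunk_size, device=device)
dA_cumsum = torch.cumsum(-torch.rand_like(dt), dim=-1)

states = chunk_state(B, x, dt, dA_cumsum)           # Triton kernel, fp32 states
states_ref = chunk_state_ref(B, x, dt, dA_cumsum)   # pure-PyTorch einsum in bf16
print((states - states_ref.float()).abs().max())    # small, dominated by bf16 rounding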
 
build/torch25-cxx98-cu118-x86_64-linux/mamba_ssm/ops/triton/ssd_combined.py DELETED
@@ -1,1884 +0,0 @@
1
- # Copyright (c) 2024, Tri Dao, Albert Gu.
2
-
3
- """We want triton==2.1.0 or 2.2.0 for this
4
- """
5
-
6
- from typing import Optional
7
-
8
- import math
9
- from packaging import version
10
-
11
- import torch
12
- import torch.nn.functional as F
13
- from torch import Tensor
14
- from ...utils.torch import custom_bwd, custom_fwd
15
-
16
- import triton
17
- import triton.language as tl
18
-
19
- from einops import rearrange, repeat
20
-
21
- try:
22
- from causal_conv1d import causal_conv1d_fn
23
- import causal_conv1d_cuda
24
- except ImportError:
25
- causal_conv1d_fn, causal_conv1d_cuda = None, None
26
-
27
- from .ssd_bmm import _bmm_chunk_fwd, _bmm_chunk_bwd
28
- from .ssd_chunk_state import _chunk_cumsum_fwd, _chunk_cumsum_bwd
29
- from .ssd_chunk_state import _chunk_state_fwd, _chunk_state_bwd_db
30
- from .ssd_chunk_state import _chunk_state_bwd_ddAcs_stable
31
- from .ssd_chunk_state import chunk_state, chunk_state_ref
32
- from .ssd_chunk_state import chunk_state_varlen
33
- from .ssd_state_passing import _state_passing_fwd, _state_passing_bwd
34
- from .ssd_state_passing import state_passing, state_passing_ref
35
- from .ssd_chunk_scan import _chunk_scan_fwd, _chunk_scan_bwd_dz, _chunk_scan_bwd_dstates
36
- from .ssd_chunk_scan import _chunk_scan_bwd_dC, _chunk_scan_bwd_dcb
37
- from .ssd_chunk_scan import _chunk_scan_bwd_ddAcs_stable
38
- from .ssd_chunk_scan import chunk_scan, chunk_scan_ref
39
- from .ssd_chunk_scan import _chunk_scan_bwd_ddAcs_prev
40
- from .layernorm_gated import rmsnorm_fn, _layer_norm_fwd, _layer_norm_bwd
41
- from .k_activations import _swiglu_fwd, _swiglu_bwd
42
-
43
- TRITON_22 = version.parse(triton.__version__) >= version.parse("2.2.0")
44
-
45
-
46
- def init_to_zero(names):
47
- return lambda nargs: [
48
- nargs[name].zero_() for name in names if nargs[name] is not None
49
- ]
50
-
51
-
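The pre_hook built by init_to_zero matters because the kernel below accumulates its ddt output with tl.atomic_add, so every autotuning trial has to start from a zeroed buffer. A small self-contained sketch of what the hook does (the helper is re-declared here so the snippet runs on its own):

# Hedged sketch: the zeroing pre_hook used by the autotune configs below.
import torch

def init_to_zero(names):
    # same helper as above: zero the named kernel arguments before each trial
    return lambda nargs: [nargs[name].zero_() for name in names if nargs[name] is not None]

hook = init_to_zero(["ddt_ptr"])
nargs = {"ddt_ptr": torch.ones(4)}        # stand-in for the kernel's argument dict
hook(nargs)
print(nargs["ddt_ptr"])                   # tensor([0., 0., 0., 0.])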
52
- @triton.autotune(
53
- configs=[
54
- triton.Config(
55
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64},
56
- num_stages=3,
57
- num_warps=8,
58
- pre_hook=init_to_zero(["ddt_ptr"]),
59
- ),
60
- triton.Config(
61
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32},
62
- num_stages=4,
63
- num_warps=4,
64
- pre_hook=init_to_zero(["ddt_ptr"]),
65
- ),
66
- triton.Config(
67
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32},
68
- num_stages=4,
69
- num_warps=4,
70
- pre_hook=init_to_zero(["ddt_ptr"]),
71
- ),
72
- triton.Config(
73
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
74
- num_stages=4,
75
- num_warps=4,
76
- pre_hook=init_to_zero(["ddt_ptr"]),
77
- ),
78
- triton.Config(
79
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32},
80
- num_stages=4,
81
- num_warps=4,
82
- pre_hook=init_to_zero(["ddt_ptr"]),
83
- ),
84
- triton.Config(
85
- {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32},
86
- num_stages=4,
87
- num_warps=4,
88
- pre_hook=init_to_zero(["ddt_ptr"]),
89
- ),
90
- triton.Config(
91
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32},
92
- num_stages=5,
93
- num_warps=4,
94
- pre_hook=init_to_zero(["ddt_ptr"]),
95
- ),
96
- triton.Config(
97
- {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
98
- num_stages=5,
99
- num_warps=4,
100
- pre_hook=init_to_zero(["ddt_ptr"]),
101
- ),
102
- triton.Config(
103
- {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
104
- num_stages=4,
105
- num_warps=4,
106
- pre_hook=init_to_zero(["ddt_ptr"]),
107
- ),
108
- ],
109
- key=["chunk_size", "hdim", "dstate"],
110
- )
111
- @triton.jit
112
- def _chunk_scan_chunk_state_bwd_dx_kernel(
113
- # Pointers to matrices
114
- x_ptr,
115
- cb_ptr,
116
- dout_ptr,
117
- dt_ptr,
118
- dA_cumsum_ptr,
119
- seq_idx_ptr,
120
- D_ptr,
121
- b_ptr,
122
- dstates_ptr,
123
- dx_ptr,
124
- ddt_ptr,
125
- dD_ptr,
126
- # Matrix dimensions
127
- chunk_size,
128
- hdim,
129
- dstate,
130
- batch,
131
- seqlen,
132
- nheads_ngroups_ratio,
133
- # Strides
134
- stride_x_batch,
135
- stride_x_seqlen,
136
- stride_x_head,
137
- stride_x_hdim,
138
- stride_cb_batch,
139
- stride_cb_chunk,
140
- stride_cb_head,
141
- stride_cb_csize_m,
142
- stride_cb_csize_k,
143
- stride_dout_batch,
144
- stride_dout_seqlen,
145
- stride_dout_head,
146
- stride_dout_hdim,
147
- stride_dt_batch,
148
- stride_dt_chunk,
149
- stride_dt_head,
150
- stride_dt_csize,
151
- stride_dA_cs_batch,
152
- stride_dA_cs_chunk,
153
- stride_dA_cs_head,
154
- stride_dA_cs_csize,
155
- stride_seq_idx_batch,
156
- stride_seq_idx_seqlen,
157
- stride_D_head,
158
- stride_b_batch,
159
- stride_b_seqlen,
160
- stride_b_head,
161
- stride_b_dstate,
162
- stride_dstates_batch,
163
- stride_dstates_chunk,
164
- stride_dstates_head,
165
- stride_dstates_hdim,
166
- stride_dstates_dstate,
167
- stride_dx_batch,
168
- stride_dx_seqlen,
169
- stride_dx_head,
170
- stride_dx_hdim,
171
- stride_ddt_batch,
172
- stride_ddt_chunk,
173
- stride_ddt_head,
174
- stride_ddt_csize,
175
- stride_dD_batch,
176
- stride_dD_chunk,
177
- stride_dD_head,
178
- stride_dD_csize,
179
- stride_dD_hdim,
180
- # Meta-parameters
181
- HAS_D: tl.constexpr,
182
- D_HAS_HDIM: tl.constexpr,
183
- HAS_SEQ_IDX: tl.constexpr,
184
- BLOCK_SIZE_M: tl.constexpr,
185
- BLOCK_SIZE_N: tl.constexpr,
186
- BLOCK_SIZE_K: tl.constexpr,
187
- BLOCK_SIZE_DSTATE: tl.constexpr,
188
- IS_TRITON_22: tl.constexpr,
189
- ):
190
- pid_bc = tl.program_id(axis=1)
191
- pid_c = pid_bc // batch
192
- pid_b = pid_bc - pid_c * batch
193
- pid_h = tl.program_id(axis=2)
194
- num_pid_n = tl.cdiv(hdim, BLOCK_SIZE_N)
195
- pid_m = tl.program_id(axis=0) // num_pid_n
196
- pid_n = tl.program_id(axis=0) % num_pid_n
197
- x_ptr += (
198
- pid_b * stride_x_batch
199
- + pid_c * chunk_size * stride_x_seqlen
200
- + pid_h * stride_x_head
201
- )
202
- cb_ptr += (
203
- pid_b * stride_cb_batch
204
- + pid_c * stride_cb_chunk
205
- + (pid_h // nheads_ngroups_ratio) * stride_cb_head
206
- )
207
- dout_ptr += (
208
- pid_b * stride_dout_batch
209
- + pid_c * chunk_size * stride_dout_seqlen
210
- + pid_h * stride_dout_head
211
- )
212
- dt_ptr += pid_b * stride_dt_batch + pid_c * stride_dt_chunk + pid_h * stride_dt_head
213
- ddt_ptr += (
214
- pid_b * stride_ddt_batch + pid_c * stride_ddt_chunk + pid_h * stride_ddt_head
215
- )
216
- dA_cumsum_ptr += (
217
- pid_b * stride_dA_cs_batch
218
- + pid_c * stride_dA_cs_chunk
219
- + pid_h * stride_dA_cs_head
220
- )
221
- b_ptr += (
222
- pid_b * stride_b_batch
223
- + pid_c * chunk_size * stride_b_seqlen
224
- + (pid_h // nheads_ngroups_ratio) * stride_b_head
225
- )
226
- dstates_ptr += (
227
- pid_b * stride_dstates_batch
228
- + pid_c * stride_dstates_chunk
229
- + pid_h * stride_dstates_head
230
- )
231
- if HAS_SEQ_IDX:
232
- seq_idx_ptr += (
233
- pid_b * stride_seq_idx_batch + pid_c * chunk_size * stride_seq_idx_seqlen
234
- )
235
-
236
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
237
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
238
-
239
- chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
240
-
241
- acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
242
-
243
- dA_cs_m = tl.load(
244
- dA_cumsum_ptr + offs_m * stride_dA_cs_csize,
245
- mask=offs_m < chunk_size_limit,
246
- other=0.0,
247
- ).to(tl.float32)
248
-
249
- dA_cs_last = tl.load(dA_cumsum_ptr + (chunk_size - 1) * stride_dA_cs_csize).to(
250
- tl.float32
251
- )
252
- if not HAS_SEQ_IDX:
253
- scale = tl.exp(dA_cs_last - dA_cs_m)
254
- else:
255
- seq_idx_m = tl.load(
256
- seq_idx_ptr + offs_m * stride_seq_idx_seqlen,
257
- mask=offs_m < chunk_size_limit,
258
- other=-1,
259
- )
260
- seq_idx_last = tl.load(
261
- seq_idx_ptr + (chunk_size_limit - 1) * stride_seq_idx_seqlen
262
- )
263
- scale = tl.where(seq_idx_m == seq_idx_last, tl.exp(dA_cs_last - dA_cs_m), 0.0)
264
- # Might be faster to just do 1 iteration with larger BLOCK_SIZE_K, up to block size 128
265
- # However, we're getting error with the Triton compiler 2.1.0 for that code path:
266
- # Unexpected mma -> mma layout conversion
267
- # Triton 2.2.0 fixes this
268
- offs_dstate = tl.arange(
269
- 0,
270
- (
271
- BLOCK_SIZE_DSTATE
272
- if IS_TRITON_22 and BLOCK_SIZE_DSTATE <= 128
273
- else BLOCK_SIZE_K
274
- ),
275
- )
276
- b_ptrs = b_ptr + (
277
- offs_m[:, None] * stride_b_seqlen + offs_dstate[None, :] * stride_b_dstate
278
- )
279
- dstates_ptrs = dstates_ptr + (
280
- offs_n[None, :] * stride_dstates_hdim
281
- + offs_dstate[:, None] * stride_dstates_dstate
282
- )
283
- if IS_TRITON_22 and BLOCK_SIZE_DSTATE <= 128:
284
- b = tl.load(
285
- b_ptrs,
286
- mask=(offs_m[:, None] < chunk_size_limit) & (offs_dstate[None, :] < dstate),
287
- other=0.0,
288
- )
289
- dstates = tl.load(
290
- dstates_ptrs,
291
- mask=(offs_dstate[:, None] < dstate) & (offs_n[None, :] < hdim),
292
- other=0.0,
293
- )
294
- dstates = dstates.to(b_ptr.dtype.element_ty)
295
- acc = tl.dot(b, dstates) * scale[:, None]
296
- else:
297
- for k in range(0, dstate, BLOCK_SIZE_K):
298
- b = tl.load(
299
- b_ptrs,
300
- mask=(offs_m[:, None] < chunk_size_limit)
301
- & (offs_dstate[None, :] < dstate - k),
302
- other=0.0,
303
- )
304
- dstates = tl.load(
305
- dstates_ptrs,
306
- mask=(offs_dstate[:, None] < dstate - k) & (offs_n[None, :] < hdim),
307
- other=0.0,
308
- )
309
- dstates = dstates.to(b_ptr.dtype.element_ty)
310
- acc += tl.dot(b, dstates)
311
- b_ptrs += BLOCK_SIZE_K * stride_b_dstate
312
- dstates_ptrs += BLOCK_SIZE_K * stride_dstates_dstate
313
- acc *= scale[:, None]
314
-
315
- # x_ptrs = x_ptr + (offs_m[:, None] * stride_x_seqlen + offs_n[None, :] * stride_x_hdim)
316
- # x = tl.load(x_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim), other=0.0).to(tl.float32)
317
- # dt_ptrs = dt_ptr + offs_m * stride_dt_csize
318
- # dt_m = tl.load(dt_ptrs, mask=offs_m < chunk_size_limit, other=0.0).to(tl.float32)
319
- # ddt = tl.sum(acc * x, axis=1) * dt_m
320
- # ddt_ptrs = ddt_ptr + offs_m * stride_ddt_csize
321
- # tl.atomic_add(ddt_ptrs, ddt, mask=offs_m < chunk_size)
322
-
323
- offs_k = tl.arange(0, BLOCK_SIZE_K)
324
- cb_ptrs = cb_ptr + (
325
- offs_m[:, None] * stride_cb_csize_m + offs_k[None, :] * stride_cb_csize_k
326
- )
327
- dout_ptrs = dout_ptr + (
328
- offs_k[:, None] * stride_dout_seqlen + offs_n[None, :] * stride_dout_hdim
329
- )
330
- dA_cumsum_ptrs = dA_cumsum_ptr + offs_k * stride_dA_cs_csize
331
- K_MAX = chunk_size_limit
332
- K_MIN = pid_m * BLOCK_SIZE_M
333
- cb_ptrs += K_MIN * stride_cb_csize_k
334
- dout_ptrs += K_MIN * stride_dout_seqlen
335
- dA_cumsum_ptrs += K_MIN * stride_dA_cs_csize
336
- for k in range(K_MIN, K_MAX, BLOCK_SIZE_K):
337
- k = tl.multiple_of(k, BLOCK_SIZE_K)
338
- # For some reason setting mask to (offs_m[:, None] < chunk_size_limit) is much slower
339
- cb = tl.load(
340
- cb_ptrs,
341
- mask=(offs_m[:, None] < chunk_size) & (offs_k[None, :] < K_MAX - k),
342
- other=0.0,
343
- )
344
- dout = tl.load(
345
- dout_ptrs,
346
- mask=(offs_k[:, None] < K_MAX - k) & (offs_n[None, :] < hdim),
347
- other=0.0,
348
- )
349
- dA_cs_k = tl.load(dA_cumsum_ptrs, mask=offs_k < K_MAX - k, other=0.0).to(
350
- tl.float32
351
- )
352
- cb *= tl.exp(dA_cs_k[None, :] - dA_cs_m[:, None])
353
- # If we don't have the (k + offs_k[None, :] < K_MAX) mask, for indices outside this range,
354
- # we might have dA_cs_m = 0.0 and dA_cs_k very negative, and tl.exp will return inf.
355
- # Multiplying with cb, which is 0.0 outside the range, will make the result NaN.
356
- # This will cause NaN in acc, and hence NaN in dx and ddt.
357
- mask = (k + offs_k[None, :] >= offs_m[:, None]) & (k + offs_k[None, :] < K_MAX)
358
- cb = tl.where(mask, cb, 0.0)
359
- cb = cb.to(dout_ptr.dtype.element_ty)
360
- acc += tl.dot(cb, dout)
361
- cb_ptrs += BLOCK_SIZE_K * stride_cb_csize_k
362
- dout_ptrs += BLOCK_SIZE_K * stride_dout_seqlen
363
- dA_cumsum_ptrs += BLOCK_SIZE_K * stride_dA_cs_csize
364
-
365
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
366
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
367
- dt_ptrs = dt_ptr + offs_m * stride_dt_csize
368
- dt_m = tl.load(dt_ptrs, mask=offs_m < chunk_size_limit, other=0.0).to(tl.float32)
369
- dx = acc * dt_m[:, None]
370
- dx_ptr += (
371
- pid_b * stride_dx_batch
372
- + pid_c * chunk_size * stride_dx_seqlen
373
- + pid_h * stride_dx_head
374
- )
375
- dx_ptrs = dx_ptr + (
376
- offs_m[:, None] * stride_dx_seqlen + offs_n[None, :] * stride_dx_hdim
377
- )
378
- if HAS_D:
379
- dout_res_ptrs = dout_ptr + (
380
- offs_m[:, None] * stride_dout_seqlen + offs_n[None, :] * stride_dout_hdim
381
- )
382
- dout_res = tl.load(
383
- dout_res_ptrs,
384
- mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim),
385
- other=0.0,
386
- ).to(tl.float32)
387
- if D_HAS_HDIM:
388
- D = tl.load(
389
- D_ptr + pid_h * stride_D_head + offs_n, mask=offs_n < hdim, other=0.0
390
- ).to(tl.float32)
391
- else:
392
- D = tl.load(D_ptr + pid_h * stride_D_head).to(tl.float32)
393
- dx += dout_res * D
394
- tl.store(
395
- dx_ptrs,
396
- dx,
397
- mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim),
398
- )
399
-
400
- x_ptrs = x_ptr + (
401
- offs_m[:, None] * stride_x_seqlen + offs_n[None, :] * stride_x_hdim
402
- )
403
- x = tl.load(
404
- x_ptrs,
405
- mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim),
406
- other=0.0,
407
- ).to(tl.float32)
408
- if HAS_D:
409
- dD_ptr += (
410
- pid_b * stride_dD_batch
411
- + pid_c * stride_dD_chunk
412
- + pid_h * stride_dD_head
413
- + pid_m * stride_dD_csize
414
- )
415
- if D_HAS_HDIM:
416
- dD_ptrs = dD_ptr + offs_n * stride_dD_hdim
417
- dD = tl.sum(dout_res * x, axis=0)
418
- tl.store(dD_ptrs, dD, mask=offs_n < hdim)
419
- else:
420
- dD = tl.sum(dout_res * x)
421
- tl.store(dD_ptr, dD)
422
- ddt = tl.sum(acc * x, axis=1)
423
- ddt_ptrs = ddt_ptr + offs_m * stride_ddt_csize
424
- tl.atomic_add(ddt_ptrs, ddt, mask=offs_m < chunk_size)
425
-
426
-
427
- def _chunk_scan_chunk_state_bwd_dx(
428
- x, dt, dA_cumsum, B, CB, dout, dstates, D=None, seq_idx=None, dx=None
429
- ):
430
- batch, seqlen, nheads, headdim = x.shape
431
- _, _, nchunks, chunk_size = dt.shape
432
- _, _, ngroups, dstate = B.shape
433
- assert nheads % ngroups == 0
434
- assert B.shape == (batch, seqlen, ngroups, dstate)
435
- assert CB.shape == (batch, nchunks, ngroups, chunk_size, chunk_size)
436
- assert dt.shape == (batch, nheads, nchunks, chunk_size)
437
- assert dA_cumsum.shape == dt.shape
438
- assert dout.shape == x.shape
439
- assert dstates.shape == (batch, nchunks, nheads, headdim, dstate)
440
- if seq_idx is not None:
441
- assert seq_idx.shape == (batch, seqlen)
442
- if D is not None:
443
- assert D.shape == (nheads, headdim) or D.shape == (nheads,)
444
- assert D.stride(-1) == 1
445
- BLOCK_SIZE_min = 32
446
- dD = torch.empty(
447
- triton.cdiv(chunk_size, BLOCK_SIZE_min),
448
- batch,
449
- nchunks,
450
- nheads,
451
- headdim if D.dim() == 2 else 1,
452
- device=D.device,
453
- dtype=torch.float32,
454
- )
455
- else:
456
- dD = None
457
- dD_strides = (
458
- (dD.stride(0), dD.stride(1), dD.stride(2), dD.stride(3), dD.stride(4))
459
- if D is not None
460
- else (0, 0, 0, 0, 0)
461
- )
462
- if dx is None:
463
- dx = torch.empty_like(x)
464
- else:
465
- assert dx.shape == x.shape
466
- ddt = torch.empty(
467
- batch, nheads, nchunks, chunk_size, device=dout.device, dtype=torch.float32
468
- )
469
- grid_dx = lambda META: (
470
- triton.cdiv(chunk_size, META["BLOCK_SIZE_M"])
471
- * triton.cdiv(headdim, META["BLOCK_SIZE_N"]),
472
- batch * nchunks,
473
- nheads,
474
- )
475
- with torch.cuda.device(x.device.index):
476
- _chunk_scan_chunk_state_bwd_dx_kernel[grid_dx](
477
- x,
478
- CB,
479
- dout,
480
- dt,
481
- dA_cumsum,
482
- seq_idx,
483
- D,
484
- B,
485
- dstates,
486
- dx,
487
- ddt,
488
- dD,
489
- chunk_size,
490
- headdim,
491
- dstate,
492
- batch,
493
- seqlen,
494
- nheads // ngroups,
495
- x.stride(0),
496
- x.stride(1),
497
- x.stride(2),
498
- x.stride(3),
499
- CB.stride(0),
500
- CB.stride(1),
501
- CB.stride(2),
502
- CB.stride(-1),
503
- CB.stride(-2),
504
- dout.stride(0),
505
- dout.stride(1),
506
- dout.stride(2),
507
- dout.stride(3),
508
- dt.stride(0),
509
- dt.stride(2),
510
- dt.stride(1),
511
- dt.stride(3),
512
- dA_cumsum.stride(0),
513
- dA_cumsum.stride(2),
514
- dA_cumsum.stride(1),
515
- dA_cumsum.stride(3),
516
- *(
517
- (seq_idx.stride(0), seq_idx.stride(1))
518
- if seq_idx is not None
519
- else (0, 0)
520
- ),
521
- D.stride(0) if D is not None else 0,
522
- B.stride(0),
523
- B.stride(1),
524
- B.stride(2),
525
- B.stride(3),
526
- dstates.stride(0),
527
- dstates.stride(1),
528
- dstates.stride(2),
529
- dstates.stride(3),
530
- dstates.stride(4),
531
- dx.stride(0),
532
- dx.stride(1),
533
- dx.stride(2),
534
- dx.stride(3),
535
- ddt.stride(0),
536
- ddt.stride(2),
537
- ddt.stride(1),
538
- ddt.stride(3),
539
- dD_strides[1],
540
- dD_strides[2],
541
- dD_strides[3],
542
- dD_strides[0],
543
- dD_strides[4],
544
- D is not None,
545
- D.dim() == 2 if D is not None else True,
546
- HAS_SEQ_IDX=seq_idx is not None,
547
- BLOCK_SIZE_DSTATE=max(triton.next_power_of_2(dstate), 16),
548
- IS_TRITON_22=TRITON_22
549
- )
550
- if D is not None:
551
- BLOCK_SIZE_actual = _chunk_scan_chunk_state_bwd_dx_kernel.best_config.kwargs[
552
- "BLOCK_SIZE_M"
553
- ]
554
- n_valid_blocks = (chunk_size + BLOCK_SIZE_actual - 1) // BLOCK_SIZE_actual
555
- dD = dD[:n_valid_blocks].sum(dim=(0, 1, 2)).to(dtype=D.dtype)
556
- if D.dim() == 1:
557
- dD = rearrange(dD, "h 1 -> h")
558
- return dx, ddt.to(dtype=dt.dtype), dD
559
-
560
-
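grid_dx above launches one program per (BLOCK_SIZE_M x BLOCK_SIZE_N) tile of a chunk, per (batch, chunk) pair, and per head. A small sketch of the resulting grid for assumed example sizes (the dimensions and the chosen config below are illustrative, not taken from this repository):

# Hedged sketch: how the 3-D launch grid for the dx kernel is sized.
import math

chunk_size, headdim = 256, 64
batch, nchunks, nheads = 2, 8, 24
META = {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64}   # M/N from one of the autotune configs above

grid = (
    math.ceil(chunk_size / META["BLOCK_SIZE_M"]) * math.ceil(headdim / META["BLOCK_SIZE_N"]),
    batch * nchunks,
    nheads,
)
print(grid)   # (4, 16, 24): 4 tiles per chunk, one program per (batch, chunk) and per head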
561
- def _mamba_chunk_scan_combined_fwd(
562
- x,
563
- dt,
564
- A,
565
- B,
566
- C,
567
- chunk_size,
568
- D=None,
569
- z=None,
570
- dt_bias=None,
571
- initial_states=None,
572
- seq_idx=None,
573
- cu_seqlens=None,
574
- dt_softplus=False,
575
- dt_limit=(0.0, float("inf")),
576
- ):
577
- batch, seqlen, nheads, headdim = x.shape
578
- _, _, ngroups, dstate = B.shape
579
- assert nheads % ngroups == 0
580
- assert B.shape == (batch, seqlen, ngroups, dstate)
581
- assert x.shape == (batch, seqlen, nheads, headdim)
582
- assert dt.shape == (batch, seqlen, nheads)
583
- assert A.shape == (nheads,)
584
- assert C.shape == B.shape
585
- if z is not None:
586
- assert z.shape == x.shape
587
- if D is not None:
588
- assert D.shape == (nheads, headdim) or D.shape == (nheads,)
589
- if seq_idx is not None:
590
- assert seq_idx.shape == (batch, seqlen)
591
- if B.stride(-1) != 1:
592
- B = B.contiguous()
593
- if C.stride(-1) != 1:
594
- C = C.contiguous()
595
- if (
596
- x.stride(-1) != 1 and x.stride(1) != 1
597
- ): # Either M or K dimension should be contiguous
598
- x = x.contiguous()
599
- if (
600
- z is not None and z.stride(-1) != 1 and z.stride(1) != 1
601
- ): # Either M or K dimension should be contiguous
602
- z = z.contiguous()
603
- if D is not None and D.stride(-1) != 1:
604
- D = D.contiguous()
605
- if initial_states is not None:
606
- assert initial_states.shape == (batch, nheads, headdim, dstate)
607
- # # (batch, nchunks, chunk_size, chunk_size) or (batch, nchunks, nheads, chunk_size, chunk_size)
608
- # dA_cumsum_tmp0, dt_tmp0 = _chunk_cumsum_fwd(dt[:, :147], A, chunk_size, dt_bias=dt_bias, dt_softplus=dt_softplus)
609
- # dA_cumsum_tmp1, dt_tmp1 = _chunk_cumsum_fwd(dt[:, 147:], A, chunk_size, dt_bias=dt_bias, dt_softplus=dt_softplus)
610
- # dA_cumsum_tmp2, dt_tmp2 = _chunk_cumsum_fwd(dt[:, 147:256], A, chunk_size, dt_bias=dt_bias, dt_softplus=dt_softplus)
611
- dA_cumsum, dt = _chunk_cumsum_fwd(
612
- dt, A, chunk_size, dt_bias=dt_bias, dt_softplus=dt_softplus, dt_limit=dt_limit
613
- )
614
- states = _chunk_state_fwd(B, x, dt, dA_cumsum, seq_idx=seq_idx, states_in_fp32=True)
615
- # states_tmp0 = _chunk_state_fwd(B[:, :147], x[:, :147], dt_tmp0, dA_cumsum_tmp0, states_in_fp32=True)
616
- # states_tmp1 = _chunk_state_fwd(B[:, 147:], x[:, 147:], dt_tmp1, dA_cumsum_tmp1, states_in_fp32=True)
617
- # states_tmp2 = _chunk_state_fwd(B[:, 147:256], x[:, 147:256], dt_tmp2, dA_cumsum_tmp2, states_in_fp32=True)
618
- states, final_states = _state_passing_fwd(
619
- rearrange(states, "... p n -> ... (p n)"),
620
- dA_cumsum[:, :, :, -1],
621
- initial_states=(
622
- rearrange(initial_states, "... p n -> ... (p n)")
623
- if initial_states is not None
624
- else None
625
- ),
626
- seq_idx=seq_idx,
627
- chunk_size=chunk_size,
628
- out_dtype=C.dtype,
629
- )
630
- states, final_states = [
631
- rearrange(t, "... (p n) -> ... p n", n=dstate) for t in [states, final_states]
632
- ]
633
- # states_tmp0 = rearrange(_state_passing_fwd(rearrange(states_tmp0, "... p n -> ... (p n)"), dA_cumsum_tmp0[:, :, :, -1], chunk_size=chunk_size), "... (p n) -> ... p n", n=dstate)
634
- # states_tmp1 = rearrange(_state_passing_fwd(rearrange(states_tmp1, "... p n -> ... (p n)"), dA_cumsum_tmp1[:, :, :, -1], chunk_size=chunk_size), "... (p n) -> ... p n", n=dstate)
635
- CB = _bmm_chunk_fwd(C, B, chunk_size, seq_idx=seq_idx, output_dtype=torch.float32)
636
- out, out_x = _chunk_scan_fwd(
637
- CB, x, dt, dA_cumsum, C, states, D=D, z=z, seq_idx=seq_idx
638
- )
639
- if cu_seqlens is None:
640
- return out, out_x, dt, dA_cumsum, states, final_states
641
- else:
642
- assert (
643
- batch == 1
644
- ), "passing cu_seqlens to get the varlen states is only supported if batch dimension is 1"
645
- varlen_states = chunk_state_varlen(
646
- B.squeeze(0),
647
- x.squeeze(0),
648
- dt.squeeze(0),
649
- dA_cumsum.squeeze(0),
650
- cu_seqlens,
651
- states.squeeze(0),
652
- )
653
- return out, out_x, dt, dA_cumsum, states, final_states, varlen_states
654
-
655
-
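A shape walkthrough of the forward pipeline above, inferred from the assertions in this file, for assumed example sizes:

# Hedged shape walkthrough of _mamba_chunk_scan_combined_fwd (illustrative sizes only).
batch, seqlen, nheads, headdim = 2, 512, 8, 64
ngroups, dstate, chunk_size = 1, 128, 256
nchunks = -(-seqlen // chunk_size)        # ceil division -> 2

shapes = {
    "x": (batch, seqlen, nheads, headdim),                                # (2, 512, 8, 64)
    "dt, dA_cumsum after _chunk_cumsum_fwd": (batch, nheads, nchunks, chunk_size),
    "states after _chunk_state_fwd": (batch, nchunks, nheads, headdim, dstate),
    "CB after _bmm_chunk_fwd": (batch, nchunks, ngroups, chunk_size, chunk_size),
    "out after _chunk_scan_fwd": (batch, seqlen, nheads, headdim),
    "final_states": (batch, nheads, headdim, dstate),
}
for name, shape in shapes.items():
    print(f"{name}: {shape}")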
656
- def _mamba_chunk_scan_combined_bwd(
657
- dout,
658
- x,
659
- dt,
660
- A,
661
- B,
662
- C,
663
- out,
664
- chunk_size,
665
- D=None,
666
- z=None,
667
- dt_bias=None,
668
- initial_states=None,
669
- dfinal_states=None,
670
- seq_idx=None,
671
- dt_softplus=False,
672
- dt_limit=(0.0, float("inf")),
673
- dx=None,
674
- ddt=None,
675
- dB=None,
676
- dC=None,
677
- dz=None,
678
- recompute_output=False,
679
- ):
680
- if dout.stride(-1) != 1:
681
- dout = dout.contiguous()
682
- batch, seqlen, nheads, headdim = x.shape
683
- nchunks = math.ceil(seqlen / chunk_size)
684
- _, _, ngroups, dstate = B.shape
685
- assert dout.shape == (batch, seqlen, nheads, headdim)
686
- assert dt.shape == (batch, seqlen, nheads)
687
- assert A.shape == (nheads,)
688
- assert nheads % ngroups == 0
689
- assert B.shape == (batch, seqlen, ngroups, dstate)
690
- assert C.shape == B.shape
691
- assert out.shape == x.shape
692
- if initial_states is not None:
693
- assert initial_states.shape == (batch, nheads, headdim, dstate)
694
- if seq_idx is not None:
695
- assert seq_idx.shape == (batch, seqlen)
696
- if dx is not None:
697
- assert dx.shape == x.shape
698
- if dB is not None:
699
- assert dB.shape == B.shape
700
- dB_given = dB
701
- else:
702
- dB_given = torch.empty_like(B)
703
- if dC is not None:
704
- assert dC.shape == C.shape
705
- dC_given = dC
706
- else:
707
- dC_given = torch.empty_like(C)
708
- if dz is not None:
709
- assert z is not None
710
- assert dz.shape == z.shape
711
- if ddt is not None:
712
- assert ddt.shape == dt.shape
713
- ddt_given = ddt
714
- else:
715
- ddt_given = torch.empty_like(dt)
716
- # TD: For some reason Triton (2.1.0 and 2.2.0) errors with
717
- # "[CUDA]: invalid device context" (e.g. during varlne test), and cloning makes it work. Idk why.
718
- dt_in = dt.clone()
719
- dA_cumsum, dt = _chunk_cumsum_fwd(
720
- dt_in,
721
- A,
722
- chunk_size,
723
- dt_bias=dt_bias,
724
- dt_softplus=dt_softplus,
725
- dt_limit=dt_limit,
726
- )
727
- CB = _bmm_chunk_fwd(C, B, chunk_size, seq_idx=seq_idx, output_dtype=torch.float32)
728
- states = _chunk_state_fwd(B, x, dt, dA_cumsum, seq_idx=seq_idx, states_in_fp32=True)
729
- states, _ = _state_passing_fwd(
730
- rearrange(states, "... p n -> ... (p n)"),
731
- dA_cumsum[:, :, :, -1],
732
- initial_states=(
733
- rearrange(initial_states, "... p n -> ... (p n)")
734
- if initial_states is not None
735
- else None
736
- ),
737
- seq_idx=seq_idx,
738
- chunk_size=chunk_size,
739
- )
740
- states = rearrange(states, "... (p n) -> ... p n", n=dstate)
741
- if z is not None:
742
- dz, dout, dD, *rest = _chunk_scan_bwd_dz(
743
- x,
744
- z,
745
- out,
746
- dout,
747
- chunk_size=chunk_size,
748
- has_ddAcs=False,
749
- D=D,
750
- dz=dz,
751
- recompute_output=recompute_output,
752
- )
753
- outz = rest[0] if recompute_output else out
754
- else:
755
- dz = None
756
- outz = out
757
- dstates = _chunk_scan_bwd_dstates(
758
- C, dA_cumsum, dout, seq_idx=seq_idx, dtype=states.dtype
759
- )
760
- # dstates has length nchunks, containing the gradient to initial states at index 0 and
761
- # gradient to the states of chunk (nchunks - 2) at index (nchunks - 1)
762
- # Do computation in fp32 but convert dstates and states to fp16/bf16 since dstates and states
763
- # will be used in matmul in the next kernels.
764
- dstates, ddA_chunk_cumsum, dinitial_states, states = _state_passing_bwd(
765
- rearrange(states, "... p n -> ... (p n)"),
766
- dA_cumsum[:, :, :, -1],
767
- rearrange(dstates, "... p n -> ... (p n)"),
768
- dfinal_states=(
769
- rearrange(dfinal_states, "... p n -> ... (p n)")
770
- if dfinal_states is not None
771
- else None
772
- ),
773
- seq_idx=seq_idx,
774
- has_initial_states=initial_states is not None,
775
- dstates_dtype=x.dtype,
776
- states_dtype=x.dtype,
777
- chunk_size=chunk_size,
778
- )
779
- # dstates has length nchunks, containing the gradient to states of chunk 0 at index 0 and
780
- # gradient to the final states at index (nchunks - 1)
781
- # states has length nchunks, containing the initial states at index 0 and the state for chunk (nchunks - 2) at index (nchunks - 1)
782
- # The final states is not stored.
783
- states = rearrange(states, "... (p n) -> ... p n", n=dstate)
784
- dstates = rearrange(dstates, "... (p n) -> ... p n", n=dstate)
785
- dinitial_states = (
786
- rearrange(dinitial_states, "... (p n) -> ... p n", n=dstate)
787
- if dinitial_states is not None
788
- else None
789
- )
790
- dx, ddt, dD_from_x = _chunk_scan_chunk_state_bwd_dx(
791
- x, dt, dA_cumsum, B, CB, dout, dstates, D=D, seq_idx=seq_idx, dx=dx
792
- )
793
- # dB = _chunk_state_bwd_db(x, dt, dA_cumsum, dstates, seq_idx=seq_idx, ngroups=ngroups)
794
- dB, ddA_next = _chunk_state_bwd_db(
795
- x, dt, dA_cumsum, dstates, seq_idx=seq_idx, B=B, ngroups=ngroups
796
- )
797
- # dC = _chunk_scan_bwd_dC(states[:, :-1].to(x.dtype), dA_cumsum, dout, seq_idx=seq_idx, ngroups=ngroups)
798
- dC, ddA_cumsum_prev = _chunk_scan_bwd_dC(
799
- states.to(x.dtype), dA_cumsum, dout, seq_idx=seq_idx, C=C, ngroups=ngroups
800
- )
801
- # Computing ddA with the dcb kernel is much slower, so we're not using it for now
802
- dCB = _chunk_scan_bwd_dcb(x, dt, dA_cumsum, dout, seq_idx=seq_idx, ngroups=ngroups)
803
- # dCB, ddA_tmp = _chunk_scan_bwd_dcb(x, dt, dA_cumsum, dout, seq_idx=seq_idx, CB=CB, ngroups=ngroups)
804
- dCB = dCB.to(CB.dtype)
805
- _bmm_chunk_bwd(C, dCB, residual=dB, out=dB_given)
806
- _bmm_chunk_bwd(B, rearrange(dCB, "... l s -> ... s l"), residual=dC, out=dC_given)
807
- # If we have z, then dout_x is recomputed in fp32 so dD = (dout_x * x).sum() is more accurate
808
- # than dD_from_x = (dout_x * x).sum() where dout_x is in fp16/bf16
809
- if z is None:
810
- dD = dD_from_x
811
- # Formula for ddA_cumsum, assuming out is the output of the forward pass before adding x * D.
812
- # ddA_cumsum = torch.einsum("bclhp,bclhp->bhcl", out.float(), dout.float()) - ddt * dt
813
- # However, this is numerically unstable: when we do the reverse cumsum on ddA_cumsum, there might
814
- # be a lot of underflow.
815
-
816
- # This is already done as part of bwd_dC kernel
817
- # ddA_cumsum_prev = _chunk_scan_bwd_ddAcs_prev(states[:, :-1], C, dout, dA_cumsum, seq_idx=seq_idx)
818
- ddA_cumsum_prev[..., -1] += ddA_chunk_cumsum
819
- ddA_prev = ddA_cumsum_prev.flip([-1]).cumsum(dim=-1).flip([-1])
820
- # This is already done as part of bwd_dB kernel
821
- # ddA_next = _chunk_state_bwd_ddAcs_stable(B, x, dt, dA_cumsum, dstates, seq_idx=seq_idx)
822
- # We don't need to pass in seq_idx because CB also zeros out entries where seq_idx[i] != seq_idx[j]
823
- ddA = _chunk_scan_bwd_ddAcs_stable(x, dt, dA_cumsum, dout, CB)
824
- ddA += ddA_next + ddA_prev
825
-
826
- ddt_given, dA, ddt_bias = _chunk_cumsum_bwd(
827
- ddA,
828
- ddt,
829
- dt_in,
830
- A,
831
- dt_bias=dt_bias,
832
- dt_softplus=dt_softplus,
833
- dt_limit=dt_limit,
834
- ddt=ddt_given,
835
- )
836
-
837
- # These 2 lines are just to test ddt and dA being computed by old code
838
- # _, dA = selective_scan_bwd(dout, x, dt, A, B, C, D=D.float(), z=z)
839
- # ddt_given.copy_(ddt)
840
-
841
- return_vals = (
842
- dx,
843
- ddt_given,
844
- dA,
845
- dB_given,
846
- dC_given,
847
- dD,
848
- dz,
849
- ddt_bias,
850
- dinitial_states,
851
- )
852
- return return_vals if not recompute_output else (*return_vals, outz)
853
-
854
-
855
- def selective_scan_bwd(dout, x, dt, A, B, C, D=None, z=None):
856
- """
857
- Argument:
858
- dout: (batch, seqlen, nheads, headdim)
859
- x: (batch, seqlen, nheads, headdim)
860
- dt: (batch, nheads, nchunks, chunk_size) or (batch, nheads, headdim, nchunks, chunk_size)
861
- A: (nheads) or (dim, dstate)
862
- B: (batch, seqlen, ngroups, dstate)
863
- C: (batch, seqlen, ngroups, dstate)
864
- D: (nheads, headdim) or (nheads,)
865
- z: (batch, seqlen, nheads, headdim)
866
- Return:
867
-        ddt: (batch, nheads, nchunks, chunk_size) or (batch, nheads, headdim, nchunks, chunk_size); dA: (nheads) or (dim, dstate)
868
- """
869
- import selective_scan
870
-
871
- batch, seqlen, nheads, headdim = x.shape
872
- chunk_size = dt.shape[-1]
873
- _, _, ngroups, dstate = B.shape
874
- assert nheads % ngroups == 0
875
- x = rearrange(x, "b l h p -> b (h p) l")
876
- squeeze_dt = dt.dim() == 4
877
- if dt.dim() == 4:
878
- dt = repeat(dt, "b h c l -> b h p c l", p=headdim)
879
- dt = rearrange(dt, "b h p c l -> b (h p) (c l)", p=headdim)
880
- squeeze_A = A.dim() == 1
881
- if A.dim() == 1:
882
- A = repeat(A, "h -> (h p) n", p=headdim, n=dstate).to(dtype=torch.float32)
883
- else:
884
- A = A.to(dtype=torch.float32)
885
- B = rearrange(B, "b l g n -> b g n l")
886
- C = rearrange(C, "b l g n -> b g n l")
887
- if D is not None:
888
- if D.dim() == 2:
889
- D = rearrange(D, "h p -> (h p)")
890
- else:
891
- D = repeat(D, "h -> (h p)", p=headdim)
892
- if z is not None:
893
- z = rearrange(z, "b l h p -> b (h p) l")
894
-
895
- if x.stride(-1) != 1:
896
- x = x.contiguous()
897
- if dt.stride(-1) != 1:
898
- dt = dt.contiguous()
899
- if D is not None:
900
- D = D.contiguous()
901
- if B.stride(-1) != 1:
902
- B = B.contiguous()
903
- if C.stride(-1) != 1:
904
- C = C.contiguous()
905
- if z is not None and z.stride(-1) != 1:
906
- z = z.contiguous()
907
- _, intermediate, *rest = selective_scan.fwd(
908
- x, dt.to(dtype=x.dtype), A, B, C, D, z, None, False
909
- )
910
- if z is not None:
911
- out = rest[0]
912
- else:
913
- out = None
914
-
915
- dout = rearrange(dout, "b l h p -> b (h p) l")
916
-
917
- if dout.stride(-1) != 1:
918
- dout = dout.contiguous()
919
- # The kernel supports passing in a pre-allocated dz (e.g., in case we want to fuse the
920
- # backward of selective_scan with the backward of chunk).
921
- # Here we just pass in None and dz will be allocated in the C++ code.
922
- _, ddt, dA, *rest = selective_scan.bwd(
923
- x,
924
- dt.to(dtype=x.dtype),
925
- A,
926
- B,
927
- C,
928
- D,
929
- z,
930
- None,
931
- dout,
932
- intermediate,
933
- out,
934
- None,
935
- False,
936
- False, # option to recompute out_z, not used here
937
- )
938
- ddt = rearrange(ddt, "b (h p) (c l) -> b h p c l", p=headdim, l=chunk_size)
939
- if squeeze_dt:
940
- ddt = ddt.float().sum(dim=2)
941
- if squeeze_A:
942
- dA = rearrange(dA, "(h p) n -> h p n", p=headdim).sum(dim=(1, 2))
943
- return ddt, dA
944
-
945
-
946
- class MambaChunkScanCombinedFn(torch.autograd.Function):
947
-
948
- @staticmethod
949
- def forward(
950
- ctx,
951
- x,
952
- dt,
953
- A,
954
- B,
955
- C,
956
- chunk_size,
957
- D=None,
958
- z=None,
959
- dt_bias=None,
960
- initial_states=None,
961
- seq_idx=None,
962
- cu_seqlens=None,
963
- dt_softplus=False,
964
- dt_limit=(0.0, float("inf")),
965
- return_final_states=False,
966
- return_varlen_states=False,
967
- ):
968
- ctx.dt_dtype = dt.dtype
969
- if not return_varlen_states:
970
- cu_seqlens = None
971
- else:
972
- assert (
973
- cu_seqlens is not None
974
- ), "cu_seqlens must be provided if return_varlen_states is True"
975
- out, out_x, dt_out, dA_cumsum, states, final_states, *rest = (
976
- _mamba_chunk_scan_combined_fwd(
977
- x,
978
- dt,
979
- A,
980
- B,
981
- C,
982
- chunk_size,
983
- D=D,
984
- z=z,
985
- dt_bias=dt_bias,
986
- initial_states=initial_states,
987
- seq_idx=seq_idx,
988
- cu_seqlens=cu_seqlens,
989
- dt_softplus=dt_softplus,
990
- dt_limit=dt_limit,
991
- )
992
- )
993
- ctx.save_for_backward(
994
- out if z is None else out_x,
995
- x,
996
- dt,
997
- dA_cumsum,
998
- A,
999
- B,
1000
- C,
1001
- D,
1002
- z,
1003
- dt_bias,
1004
- initial_states,
1005
- seq_idx,
1006
- )
1007
- ctx.dt_softplus = dt_softplus
1008
- ctx.chunk_size = chunk_size
1009
- ctx.dt_limit = dt_limit
1010
- ctx.return_final_states = return_final_states
1011
- ctx.return_varlen_states = return_varlen_states
1012
- if not return_varlen_states:
1013
- return out if not return_final_states else (out, final_states)
1014
- else:
1015
- varlen_states = rest[0]
1016
- return (
1017
- (out, varlen_states)
1018
- if not return_final_states
1019
- else (out, final_states, varlen_states)
1020
- )
1021
-
1022
- @staticmethod
1023
- def backward(ctx, dout, *args):
1024
- out, x, dt, dA_cumsum, A, B, C, D, z, dt_bias, initial_states, seq_idx = (
1025
- ctx.saved_tensors
1026
- )
1027
- assert (
1028
- not ctx.return_varlen_states
1029
- ), "return_varlen_states is not supported in backward"
1030
- dfinal_states = args[0] if ctx.return_final_states else None
1031
- dx, ddt, dA, dB, dC, dD, dz, ddt_bias, dinitial_states = (
1032
- _mamba_chunk_scan_combined_bwd(
1033
- dout,
1034
- x,
1035
- dt,
1036
- A,
1037
- B,
1038
- C,
1039
- out,
1040
- ctx.chunk_size,
1041
- D=D,
1042
- z=z,
1043
- dt_bias=dt_bias,
1044
- initial_states=initial_states,
1045
- dfinal_states=dfinal_states,
1046
- seq_idx=seq_idx,
1047
- dt_softplus=ctx.dt_softplus,
1048
- dt_limit=ctx.dt_limit,
1049
- )
1050
- )
1051
- return (
1052
- dx,
1053
- ddt,
1054
- dA,
1055
- dB,
1056
- dC,
1057
- None,
1058
- dD,
1059
- dz,
1060
- ddt_bias,
1061
- dinitial_states,
1062
- None,
1063
- None,
1064
- None,
1065
- None,
1066
- None,
1067
- None,
1068
- )
1069
-
1070
-
1071
- def mamba_chunk_scan_combined(
1072
- x,
1073
- dt,
1074
- A,
1075
- B,
1076
- C,
1077
- chunk_size,
1078
- D=None,
1079
- z=None,
1080
- dt_bias=None,
1081
- initial_states=None,
1082
- seq_idx=None,
1083
- cu_seqlens=None,
1084
- dt_softplus=False,
1085
- dt_limit=(0.0, float("inf")),
1086
- return_final_states=False,
1087
- return_varlen_states=False,
1088
- ):
1089
- """
1090
- Argument:
1091
- x: (batch, seqlen, nheads, headdim)
1092
- dt: (batch, seqlen, nheads)
1093
- A: (nheads)
1094
- B: (batch, seqlen, ngroups, dstate)
1095
- C: (batch, seqlen, ngroups, dstate)
1096
- chunk_size: int
1097
- D: (nheads, headdim) or (nheads,)
1098
- z: (batch, seqlen, nheads, headdim)
1099
- dt_bias: (nheads,)
1100
- initial_states: (batch, nheads, headdim, dstate)
1101
- seq_idx: (batch, seqlen)
1102
- cu_seqlens: (num_sequences + 1) or None, only used if return_varlen_states is True
1103
- dt_softplus: Whether to apply softplus to dt
1104
- Return:
1105
- out: (batch, seqlen, nheads, headdim)
1106
- """
1107
- return MambaChunkScanCombinedFn.apply(
1108
- x,
1109
- dt,
1110
- A,
1111
- B,
1112
- C,
1113
- chunk_size,
1114
- D,
1115
- z,
1116
- dt_bias,
1117
- initial_states,
1118
- seq_idx,
1119
- cu_seqlens,
1120
- dt_softplus,
1121
- dt_limit,
1122
- return_final_states,
1123
- return_varlen_states,
1124
- )
1125
-
1126
-
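A minimal usage sketch for mamba_chunk_scan_combined, with shapes following the docstring above. The sizes, dtypes, and random inputs are assumptions for illustration; running it requires a CUDA device plus the Triton kernels shipped with this package:

# Hedged usage sketch for the public entry point defined above.
import torch
from mamba_ssm.ops.triton.ssd_combined import mamba_chunk_scan_combined  # import path per this repo's layout

batch, seqlen, nheads, headdim = 2, 512, 8, 64
ngroups, dstate, chunk_size = 1, 128, 256
dev, dtype = "cuda", torch.bfloat16

x = torch.randn(batch, seqlen, nheads, headdim, device=dev, dtype=dtype)
dt = torch.rand(batch, seqlen, nheads, device=dev, dtype=torch.float32)
A = -torch.rand(nheads, device=dev, dtype=torch.float32)               # negative for decay
B = torch.randn(batch, seqlen, ngroups, dstate, device=dev, dtype=dtype)
C = torch.randn(batch, seqlen, ngroups, dstate, device=dev, dtype=dtype)
D = torch.randn(nheads, device=dev, dtype=torch.float32)

out = mamba_chunk_scan_combined(x, dt, A, B, C, chunk_size, D=D, dt_softplus=True)
print(out.shape)   # (2, 512, 8, 64)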
1127
- def mamba_chunk_scan(
1128
- x, dt, A, B, C, chunk_size, D=None, z=None, dt_bias=None, dt_softplus=False
1129
- ):
1130
- """
1131
- Argument:
1132
- x: (batch, seqlen, nheads, headdim)
1133
- dt: (batch, seqlen, nheads)
1134
- A: (nheads)
1135
- B: (batch, seqlen, ngroups, dstate)
1136
- C: (batch, seqlen, ngroups, dstate)
1137
- D: (nheads, headdim) or (nheads,)
1138
- z: (batch, seqlen, nheads, headdim)
1139
- dt_bias: (nheads,)
1140
- Return:
1141
- out: (batch, seqlen, nheads, headdim)
1142
- """
1143
- batch, seqlen, nheads, headdim = x.shape
1144
- dstate = B.shape[-1]
1145
- if seqlen % chunk_size != 0:
1146
- dt = F.pad(dt, (0, 0, 0, chunk_size - seqlen % chunk_size))
1147
- dt = rearrange(dt, "b (c l) h -> b h c l", l=chunk_size)
1148
- dt = dt.float() # We want high precision for this before cumsum
1149
- if dt_bias is not None:
1150
- dt = dt + rearrange(dt_bias, "h -> h 1 1")
1151
- if dt_softplus:
1152
- dt = F.softplus(dt)
1153
- dA = dt * rearrange(A, "h -> h 1 1")
1155
- dA_cumsum = torch.cumsum(dA, dim=-1)
1156
- # 1. Compute the state for each chunk
1157
- states = chunk_state(B, x, dt, dA_cumsum, states_in_fp32=True)
1158
- # 2. Pass the state to all the chunks by weighted cumsum.
1159
- states = rearrange(
1160
- state_passing(
1161
- rearrange(states, "... p n -> ... (p n)"), dA_cumsum[:, :, :, -1]
1162
- )[0],
1163
- "... (p n) -> ... p n",
1164
- n=dstate,
1165
- )
1166
- # 3. Compute the output for each chunk
1167
- out = chunk_scan(B, C, x, dt, dA_cumsum, states, D=D, z=z)
1168
- return out
1169
-
1170
-
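The three numbered steps in mamba_chunk_scan above are the chunked, parallel form of a simple per-timestep recurrence. The sketch below is a slow reference of that recurrence, not code from this file: z gating, dt bias/softplus, and initial states are omitted, and dt is assumed to already be the transformed step size.

# Hedged per-timestep reference of the scan that steps 1-3 above compute chunk-wise.
import torch

def naive_ssd_scan(x, dt, A, B, C, D=None):
    # x: (batch, seqlen, nheads, headdim), dt: (batch, seqlen, nheads), A: (nheads,)
    # B, C: (batch, seqlen, ngroups, dstate)
    batch, seqlen, nheads, headdim = x.shape
    ngroups, dstate = B.shape[2], B.shape[3]
    rep = nheads // ngroups
    h = torch.zeros(batch, nheads, headdim, dstate, dtype=torch.float32)
    ys = []
    for t in range(seqlen):
        Bt = B[:, t].repeat_interleave(rep, dim=1).float()   # (batch, nheads, dstate)
        Ct = C[:, t].repeat_interleave(rep, dim=1).float()
        decay = torch.exp(dt[:, t] * A)                      # (batch, nheads)
        h = decay[..., None, None] * h + torch.einsum(
            "bh,bhp,bhn->bhpn", dt[:, t].float(), x[:, t].float(), Bt
        )
        y = torch.einsum("bhpn,bhn->bhp", h, Ct)
        if D is not None:
            y = y + (D[:, None] if D.dim() == 1 else D) * x[:, t].float()
        ys.append(y)
    return torch.stack(ys, dim=1).to(x.dtype)                # (batch, seqlen, nheads, headdim)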
1171
- def ssd_chunk_scan_combined_ref(
1172
- x, dt, A, B, C, chunk_size, D=None, z=None, dt_bias=None, dt_softplus=False
1173
- ):
1174
- """
1175
- Argument:
1176
- x: (batch, seqlen, nheads, headdim)
1177
- dt: (batch, seqlen, nheads)
1178
- A: (nheads)
1179
- B: (batch, seqlen, ngroups, dstate)
1180
- C: (batch, seqlen, ngroups, dstate)
1181
- D: (nheads, headdim) or (nheads,)
1182
- z: (batch, seqlen, nheads, headdim)
1183
- dt_bias: (nheads,)
1184
- Return:
1185
- out: (batch, seqlen, nheads, headdim)
1186
- """
1187
- batch, seqlen, nheads, headdim = x.shape
1188
- dstate = B.shape[-1]
1189
- if seqlen % chunk_size != 0:
1190
- dt = F.pad(dt, (0, 0, 0, chunk_size - seqlen % chunk_size))
1191
- dt = rearrange(dt, "b (c l) h -> b h c l", l=chunk_size)
1192
- dt = dt.float() # We want high precision for this before cumsum
1193
- if dt_bias is not None:
1194
- dt = dt + rearrange(dt_bias, "h -> h 1 1")
1195
- if dt_softplus:
1196
- dt = F.softplus(dt)
1197
- dA = dt * rearrange(A, "h -> h 1 1")
1198
- dA_cumsum = torch.cumsum(dA, dim=-1)
1199
- # 1. Compute the state for each chunk
1200
- states = chunk_state_ref(B, x, dt, dA_cumsum)
1201
- states_dtype = states.dtype
1202
- if states.dtype not in [torch.float32, torch.float64]:
1203
- states = states.to(torch.float32)
1204
- # 2. Pass the state to all the chunks by weighted cumsum.
1205
- # state_passing_ref is much less numerically stable
1206
- states = rearrange(
1207
- state_passing_ref(
1208
- rearrange(states, "... p n -> ... (p n)"), dA_cumsum[:, :, :, -1]
1209
- )[0],
1210
- "... (p n) -> ... p n",
1211
- n=dstate,
1212
- )
1213
- states = states.to(states_dtype)
1214
- # 3. Compute the output for each chunk
1215
- out = chunk_scan_ref(B, C, x, dt, dA_cumsum, states, D=D, z=z)
1216
- return out
1217
-
1218
-
1219
- def ssd_selective_scan(
1220
- x,
1221
- dt,
1222
- A,
1223
- B,
1224
- C,
1225
- D=None,
1226
- z=None,
1227
- dt_bias=None,
1228
- dt_softplus=False,
1229
- dt_limit=(0.0, float("inf")),
1230
- ):
1231
- """
1232
- Argument:
1233
- x: (batch, seqlen, nheads, headdim)
1234
- dt: (batch, seqlen, nheads) or (batch, seqlen, nheads, headdim)
1235
- A: (nheads) or (dim, dstate)
1236
- B: (batch, seqlen, ngroups, dstate)
1237
- C: (batch, seqlen, ngroups, dstate)
1238
- D: (nheads, headdim) or (nheads,)
1239
- z: (batch, seqlen, nheads, headdim)
1240
- dt_bias: (nheads,) or (nheads, headdim)
1241
- Return:
1242
- out: (batch, seqlen, nheads, headdim)
1243
- """
1244
- from ..selective_scan_interface import selective_scan_fn
1245
-
1246
- batch, seqlen, nheads, headdim = x.shape
1247
- _, _, ngroups, dstate = B.shape
1248
- x = rearrange(x, "b l h p -> b (h p) l")
1249
- if dt.dim() == 3:
1250
- dt = repeat(dt, "b l h -> b l h p", p=headdim)
1251
- dt = rearrange(dt, "b l h p -> b (h p) l")
1252
- if A.dim() == 1:
1253
- A = repeat(A, "h -> (h p) n", p=headdim, n=dstate).to(dtype=torch.float32)
1254
- else:
1255
- A = A.to(dtype=torch.float32)
1256
- B = rearrange(B, "b l g n -> b g n l")
1257
- C = rearrange(C, "b l g n -> b g n l")
1258
- if D is not None:
1259
- if D.dim() == 2:
1260
- D = rearrange(D, "h p -> (h p)")
1261
- else:
1262
- D = repeat(D, "h -> (h p)", p=headdim)
1263
- if z is not None:
1264
- z = rearrange(z, "b l h p -> b (h p) l")
1265
- if dt_bias is not None:
1266
- if dt_bias.dim() == 1:
1267
- dt_bias = repeat(dt_bias, "h -> h p", p=headdim)
1268
- dt_bias = rearrange(dt_bias, "h p -> (h p)")
1269
- if dt_limit != (0.0, float("inf")):
1270
- if dt_bias is not None:
1271
- dt = dt + rearrange(dt_bias, "d -> d 1")
1272
- if dt_softplus:
1273
- dt = F.softplus(dt)
1274
- dt = dt.clamp(min=dt_limit[0], max=dt_limit[1]).to(x.dtype)
1275
- dt_bias = None
1276
- dt_softplus = None
1277
- out = selective_scan_fn(
1278
- x, dt, A, B, C, D=D, z=z, delta_bias=dt_bias, delta_softplus=dt_softplus
1279
- )
1280
- return rearrange(out, "b (h p) l -> b l h p", p=headdim)
1281
-
1282
-
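ssd_selective_scan above is mostly a layout bridge: the multi-head (b, l, h, p) tensors used in this file are flattened to the (b, d, l) layout that selective_scan_fn expects, then reshaped back. A small sketch of that round trip (pure reshaping, no kernels):

# Hedged sketch of the layout round trip performed above.
import torch
from einops import rearrange

batch, seqlen, nheads, headdim = 2, 16, 4, 8
x = torch.randn(batch, seqlen, nheads, headdim)

flat = rearrange(x, "b l h p -> b (h p) l")        # layout expected by selective_scan_fn
back = rearrange(flat, "b (h p) l -> b l h p", p=headdim)

print(flat.shape)                                  # torch.Size([2, 32, 16])
print(torch.equal(back, x))                        # True: the round trip is lossless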
1283
- def mamba_conv1d_scan_ref(
1284
- xBC,
1285
- conv1d_weight,
1286
- conv1d_bias,
1287
- dt,
1288
- A,
1289
- chunk_size,
1290
- D=None,
1291
- z=None,
1292
- dt_bias=None,
1293
- dt_softplus=False,
1294
- dt_limit=(0.0, float("inf")),
1295
- activation="silu",
1296
- headdim=None,
1297
- ngroups=1,
1298
- ):
1299
- """
1300
- Argument:
1301
- xBC: (batch, seqlen, dim + 2 * ngroups * dstate) where dim == nheads * headdim
1302
- conv1d_weight: (dim + 2 * ngroups * dstate, width)
1303
- conv1d_bias: (dim + 2 * ngroups * dstate,)
1304
- dt: (batch, seqlen, nheads) or (batch, seqlen, nheads, headdim)
1305
- A: (nheads)
1306
- D: (nheads, headdim) or (nheads,)
1307
- z: (batch, seqlen, dim)
1308
- dt_bias: (nheads) or (nheads, headdim)
1309
- headdim: if D is 1D and z is None, headdim must be passed in
1310
- Return:
1311
- out: (batch, seqlen, dim)
1312
- """
1313
- batch, seqlen, nheads = dt.shape[:3]
1314
- assert nheads % ngroups == 0
1315
- if z is not None:
1316
- dim = z.shape[-1]
1317
- assert dim % nheads == 0
1318
- headdim = dim // nheads
1319
- else:
1320
- if D.dim() == 1:
1321
- assert headdim is not None
1322
- else:
1323
- headdim = D.shape[1]
1324
- dim = nheads * headdim
1325
- xBC = rearrange(
1326
- causal_conv1d_fn(
1327
- rearrange(xBC, "b s d -> b d s"),
1328
- conv1d_weight,
1329
- conv1d_bias,
1330
- activation=activation,
1331
- ),
1332
- "b d s -> b s d",
1333
- )
1334
- dstate = (xBC.shape[-1] - dim) // ngroups // 2
1335
- x, B, C = torch.split(xBC, [dim, ngroups * dstate, ngroups * dstate], dim=-1)
1336
- x = rearrange(x, "b l (h p) -> b l h p", h=nheads)
1337
- B = rearrange(B, "b l (g n) -> b l g n", g=ngroups)
1338
- C = rearrange(C, "b l (g n) -> b l g n", g=ngroups)
1339
- z = rearrange(z, "b l (h p) -> b l h p", h=nheads) if z is not None else None
1340
- out = ssd_selective_scan(
1341
- x,
1342
- dt.to(x.dtype),
1343
- A,
1344
- B,
1345
- C,
1346
- D=D.float(),
1347
- z=z,
1348
- dt_bias=dt_bias,
1349
- dt_softplus=dt_softplus,
1350
- dt_limit=dt_limit,
1351
- )
1352
- return rearrange(out, "b s h p -> b s (h p)")
1353
-
1354
-
1355
- class MambaSplitConv1dScanCombinedFn(torch.autograd.Function):
1356
-
1357
- @staticmethod
1358
- @custom_fwd
1359
- def forward(
1360
- ctx,
1361
- zxbcdt,
1362
- conv1d_weight,
1363
- conv1d_bias,
1364
- dt_bias,
1365
- A,
1366
- D,
1367
- chunk_size,
1368
- initial_states=None,
1369
- seq_idx=None,
1370
- dt_limit=(0.0, float("inf")),
1371
- return_final_states=False,
1372
- activation="silu",
1373
- rmsnorm_weight=None,
1374
- rmsnorm_eps=1e-6,
1375
- outproj_weight=None,
1376
- outproj_bias=None,
1377
- headdim=None,
1378
- ngroups=1,
1379
- norm_before_gate=True,
1380
- ):
1381
- assert activation in [None, "silu", "swish"]
1382
- if D.dim() == 1:
1383
- assert headdim is not None
1384
- (nheads,) = D.shape
1385
- else:
1386
- nheads, headdim = D.shape
1387
- batch, seqlen, _ = zxbcdt.shape
1388
- dim = nheads * headdim
1389
- assert nheads % ngroups == 0
1390
- dstate = (conv1d_weight.shape[0] - dim) // ngroups // 2
1391
- d_nonssm = (zxbcdt.shape[-1] - 2 * dim - 2 * ngroups * dstate - nheads) // 2
1392
- assert d_nonssm >= 0
1393
- assert zxbcdt.shape == (
1394
- batch,
1395
- seqlen,
1396
- 2 * d_nonssm + 2 * dim + 2 * ngroups * dstate + nheads,
1397
- )
1398
- assert dt_bias.shape == (nheads,)
1399
- assert A.shape == (nheads,)
1400
- zx0, z, xBC, dt = torch.split(
1401
- zxbcdt, [2 * d_nonssm, dim, dim + ngroups * dstate * 2, nheads], dim=-1
1402
- )
1403
- seq_idx = seq_idx.contiguous() if seq_idx is not None else None
1404
- xBC_conv = rearrange(
1405
- causal_conv1d_cuda.causal_conv1d_fwd(
1406
- rearrange(xBC, "b s d -> b d s"),
1407
- conv1d_weight,
1408
- conv1d_bias,
1409
- seq_idx,
1410
- None,
1411
- None,
1412
- activation in ["silu", "swish"],
1413
- ),
1414
- "b d s -> b s d",
1415
- )
1416
- x, B, C = torch.split(
1417
- xBC_conv, [dim, ngroups * dstate, ngroups * dstate], dim=-1
1418
- )
1419
- x = rearrange(x, "b l (h p) -> b l h p", h=nheads)
1420
- B = rearrange(B, "b l (g n) -> b l g n", g=ngroups)
1421
- C = rearrange(C, "b l (g n) -> b l g n", g=ngroups)
1422
- z = rearrange(z, "b l (h p) -> b l h p", h=nheads) if z is not None else None
1423
- if rmsnorm_weight is None:
1424
- out, out_x, dt_out, dA_cumsum, states, final_states = (
1425
- _mamba_chunk_scan_combined_fwd(
1426
- x,
1427
- dt,
1428
- A,
1429
- B,
1430
- C,
1431
- chunk_size=chunk_size,
1432
- D=D,
1433
- z=z,
1434
- dt_bias=dt_bias,
1435
- initial_states=initial_states,
1436
- seq_idx=seq_idx,
1437
- dt_softplus=True,
1438
- dt_limit=dt_limit,
1439
- )
1440
- )
1441
- out = rearrange(out, "b s h p -> b s (h p)")
1442
- rstd = None
1443
- if d_nonssm > 0:
1444
- out = torch.cat([_swiglu_fwd(zx0), out], dim=-1)
1445
- else:
1446
- out_x, _, dt_out, dA_cumsum, states, final_states = (
1447
- _mamba_chunk_scan_combined_fwd(
1448
- x,
1449
- dt,
1450
- A,
1451
- B,
1452
- C,
1453
- chunk_size=chunk_size,
1454
- D=D,
1455
- z=None,
1456
- dt_bias=dt_bias,
1457
- initial_states=initial_states,
1458
- seq_idx=seq_idx,
1459
- dt_softplus=True,
1460
- dt_limit=dt_limit,
1461
- )
1462
- )
1463
- # reshape input data into 2D tensor
1464
- x_rms = rearrange(out_x, "b s h p -> (b s) (h p)")
1465
- z_rms = rearrange(z, "b s h p -> (b s) (h p)")
1466
- rmsnorm_weight = rmsnorm_weight.contiguous()
1467
- if d_nonssm == 0:
1468
- out = None
1469
- else:
1470
- out01 = torch.empty(
1471
- (batch, seqlen, d_nonssm + dim),
1472
- dtype=x_rms.dtype,
1473
- device=x_rms.device,
1474
- )
1475
- out = rearrange(out01[..., d_nonssm:], "b s d -> (b s) d")
1476
- _swiglu_fwd(zx0, out=out01[..., :d_nonssm])
1477
- out, _, rstd = _layer_norm_fwd(
1478
- x_rms,
1479
- rmsnorm_weight,
1480
- None,
1481
- rmsnorm_eps,
1482
- z_rms,
1483
- out=out,
1484
- group_size=dim // ngroups,
1485
- norm_before_gate=norm_before_gate,
1486
- is_rms_norm=True,
1487
- )
1488
- if d_nonssm == 0:
1489
- out = rearrange(out, "(b s) d -> b s d", b=batch)
1490
- else:
1491
- out = out01
1492
- ctx.outproj_weight_dtype = (
1493
- outproj_weight.dtype if outproj_weight is not None else None
1494
- )
1495
- if outproj_weight is not None:
1496
- if torch.is_autocast_enabled():
1497
- dtype = torch.get_autocast_gpu_dtype()
1498
- out, outproj_weight = out.to(dtype), outproj_weight.to(dtype)
1499
- outproj_bias = (
1500
- outproj_bias.to(dtype) if outproj_bias is not None else None
1501
- )
1502
- out = F.linear(out, outproj_weight, outproj_bias)
1503
- else:
1504
- assert outproj_bias is None
1505
- ctx.save_for_backward(
1506
- zxbcdt,
1507
- conv1d_weight,
1508
- conv1d_bias,
1509
- out_x,
1510
- A,
1511
- D,
1512
- dt_bias,
1513
- initial_states,
1514
- seq_idx,
1515
- rmsnorm_weight,
1516
- rstd,
1517
- outproj_weight,
1518
- outproj_bias,
1519
- )
1520
- ctx.dt_limit = dt_limit
1521
- ctx.return_final_states = return_final_states
1522
- ctx.activation = activation
1523
- ctx.rmsnorm_eps = rmsnorm_eps
1524
- ctx.norm_before_gate = norm_before_gate
1525
- ctx.chunk_size = chunk_size
1526
- ctx.headdim = headdim
1527
- ctx.ngroups = ngroups
1528
- return out if not return_final_states else (out, final_states)
1529
-
1530
- @staticmethod
1531
- @custom_bwd
1532
- def backward(ctx, dout, *args):
1533
- (
1534
- zxbcdt,
1535
- conv1d_weight,
1536
- conv1d_bias,
1537
- out,
1538
- A,
1539
- D,
1540
- dt_bias,
1541
- initial_states,
1542
- seq_idx,
1543
- rmsnorm_weight,
1544
- rstd,
1545
- outproj_weight,
1546
- outproj_bias,
1547
- ) = ctx.saved_tensors
1548
- dfinal_states = args[0] if ctx.return_final_states else None
1549
- headdim = ctx.headdim
1550
- nheads = D.shape[0]
1551
- dim = nheads * headdim
1552
- assert nheads % ctx.ngroups == 0
1553
- dstate = (conv1d_weight.shape[0] - dim) // ctx.ngroups // 2
1554
- d_nonssm = (zxbcdt.shape[-1] - 2 * dim - 2 * ctx.ngroups * dstate - nheads) // 2
1555
- assert d_nonssm >= 0
1556
- recompute_output = outproj_weight is not None
1557
- if recompute_output:
1558
- out_recompute = torch.empty(
1559
- *out.shape[:2], d_nonssm + dim, device=out.device, dtype=out.dtype
1560
- )
1561
- out0_recompute, out1_recompute = out_recompute.split(
1562
- [d_nonssm, dim], dim=-1
1563
- )
1564
- zx0, z, xBC, dt = torch.split(
1565
- zxbcdt, [2 * d_nonssm, dim, dim + 2 * ctx.ngroups * dstate, nheads], dim=-1
1566
- )
1567
- # Recompute x, B, C
1568
- xBC_conv = rearrange(
1569
- causal_conv1d_cuda.causal_conv1d_fwd(
1570
- rearrange(xBC, "b s d -> b d s"),
1571
- conv1d_weight,
1572
- conv1d_bias,
1573
- seq_idx,
1574
- None,
1575
- None,
1576
- ctx.activation in ["silu", "swish"],
1577
- ),
1578
- "b d s -> b s d",
1579
- )
1580
- x, B, C = torch.split(
1581
- xBC_conv, [dim, ctx.ngroups * dstate, ctx.ngroups * dstate], dim=-1
1582
- )
1583
- x = rearrange(x, "b l (h p) -> b l h p", h=nheads)
1584
- B = rearrange(B, "b l (g n) -> b l g n", g=ctx.ngroups)
1585
- C = rearrange(C, "b l (g n) -> b l g n", g=ctx.ngroups)
1586
- dzxbcdt = torch.empty_like(zxbcdt)
1587
- dzx0, dz, dxBC_given, ddt_given = torch.split(
1588
- dzxbcdt, [2 * d_nonssm, dim, dim + 2 * ctx.ngroups * dstate, nheads], dim=-1
1589
- )
1590
- dxBC = torch.empty_like(xBC)
1591
- dx, dB, dC = torch.split(
1592
- dxBC, [dim, ctx.ngroups * dstate, ctx.ngroups * dstate], dim=-1
1593
- )
1594
- z = rearrange(z, "b l (h p) -> b l h p", h=nheads)
1595
- dx = rearrange(dx, "b l (h p) -> b l h p", h=nheads)
1596
- dB = rearrange(dB, "b l (g n) -> b l g n", g=ctx.ngroups)
1597
- dC = rearrange(dC, "b l (g n) -> b l g n", g=ctx.ngroups)
1598
- if outproj_weight is not None:
1599
- dout_og = dout
1600
- dout = F.linear(dout, outproj_weight.t())
1601
- if d_nonssm > 0:
1602
- dout0, dout = dout.split([d_nonssm, dim], dim=-1)
1603
- _swiglu_bwd(zx0, dout0, dxy=dzx0, recompute_output=True, out=out0_recompute)
1604
- dout = rearrange(dout, "b s (h p) -> b s h p", p=headdim)
1605
- if rmsnorm_weight is None:
1606
- dz = rearrange(dz, "b l (h p) -> b l h p", h=nheads)
1607
- dx, ddt, dA, dB, dC, dD, dz, ddt_bias, dinitial_states, *rest = (
1608
- _mamba_chunk_scan_combined_bwd(
1609
- dout,
1610
- x,
1611
- dt,
1612
- A,
1613
- B,
1614
- C,
1615
- out,
1616
- ctx.chunk_size,
1617
- D=D,
1618
- z=z,
1619
- dt_bias=dt_bias,
1620
- initial_states=initial_states,
- dfinal_states=dfinal_states,
- seq_idx=seq_idx,
- dt_softplus=True,
- dt_limit=ctx.dt_limit,
- dx=dx,
- ddt=ddt_given,
- dB=dB,
- dC=dC,
- dz=dz,
- recompute_output=recompute_output,
- )
- )
- out_for_linear = (
- rearrange(rest[0], "b s h p -> b s (h p)") if recompute_output else None
- )
- drmsnorm_weight = None
- else:
- batch = dout.shape[0]
- dy_rms = rearrange(dout, "b s h p -> (b s) (h p)")
- dz = rearrange(dz, "b l d -> (b l) d")
- x_rms = rearrange(out, "b s h p -> (b s) (h p)")
- z_rms = rearrange(z, "b s h p -> (b s) (h p)")
- out1_recompute = (
- rearrange(out1_recompute, "b s d -> (b s) d")
- if recompute_output
- else None
- )
- dout, drmsnorm_weight, _, dz, *rest = _layer_norm_bwd(
- dy_rms,
- x_rms,
- rmsnorm_weight,
- None,
- ctx.rmsnorm_eps,
- None,
- rstd,
- z_rms,
- group_size=dim // ctx.ngroups,
- norm_before_gate=ctx.norm_before_gate,
- is_rms_norm=True,
- recompute_output=recompute_output,
- dz=dz,
- out=out1_recompute if recompute_output else None,
- )
- out_for_linear = out_recompute if recompute_output else None
- dout = rearrange(dout, "(b s) (h p) -> b s h p", b=batch, p=headdim)
- dx, ddt, dA, dB, dC, dD, _, ddt_bias, dinitial_states = (
- _mamba_chunk_scan_combined_bwd(
- dout,
- x,
- dt,
- A,
- B,
- C,
- out,
- ctx.chunk_size,
- D=D,
- z=None,
- dt_bias=dt_bias,
- initial_states=initial_states,
- dfinal_states=dfinal_states,
- seq_idx=seq_idx,
- dt_softplus=True,
- dt_limit=ctx.dt_limit,
- dx=dx,
- ddt=ddt_given,
- dB=dB,
- dC=dC,
- )
- )
-
- if outproj_weight is not None:
- doutproj_weight = torch.einsum("bso,bsd->od", dout_og, out_for_linear)
- doutproj_bias = (
- dout_og.sum(dim=(0, 1)) if outproj_bias is not None else None
- )
- else:
- doutproj_weight, doutproj_bias = None, None
- dxBC_given = rearrange(dxBC_given, "b s d -> b d s")
- dxBC_given, dweight, dbias, *_ = causal_conv1d_cuda.causal_conv1d_bwd(
- rearrange(xBC, "b s d -> b d s"),
- conv1d_weight,
- conv1d_bias,
- rearrange(dxBC, "b s d -> b d s"),
- seq_idx,
- None,
- None,
- dxBC_given,
- False,
- ctx.activation in ["silu", "swish"],
- )
- dxBC_given = rearrange(dxBC_given, "b d s -> b s d")
- return (
- dzxbcdt,
- dweight,
- dbias,
- ddt_bias,
- dA,
- dD,
- None,
- dinitial_states,
- None,
- None,
- None,
- None,
- drmsnorm_weight,
- None,
- doutproj_weight,
- doutproj_bias,
- None,
- None,
- None,
- )
-
-
- def mamba_split_conv1d_scan_combined(
- zxbcdt,
- conv1d_weight,
- conv1d_bias,
- dt_bias,
- A,
- D,
- chunk_size,
- initial_states=None,
- seq_idx=None,
- dt_limit=(0.0, float("inf")),
- return_final_states=False,
- activation="silu",
- rmsnorm_weight=None,
- rmsnorm_eps=1e-6,
- outproj_weight=None,
- outproj_bias=None,
- headdim=None,
- ngroups=1,
- norm_before_gate=True,
- ):
- """
- Argument:
- zxbcdt: (batch, seqlen, 2 * dim + 2 * ngroups * dstate + nheads) where dim == nheads * headdim
- conv1d_weight: (dim + 2 * ngroups * dstate, width)
- conv1d_bias: (dim + 2 * ngroups * dstate,)
- dt_bias: (nheads,)
- A: (nheads)
- D: (nheads, headdim) or (nheads,)
- initial_states: (batch, nheads, headdim, dstate)
- seq_idx: (batch, seqlen), int32
- rmsnorm_weight: (dim,)
- outproj_weight: (out_dim, dim)
- outproj_bias: (out_dim,)
- headdim: if D is 1D, headdim must be passed in
- norm_before_gate: if True, we do RMSNorm(x) * F.silu(z). If False, we do RMSNorm(x * F.silu(z))
- Return:
- out: (batch, seqlen, dim)
- """
- return MambaSplitConv1dScanCombinedFn.apply(
- zxbcdt,
- conv1d_weight,
- conv1d_bias,
- dt_bias,
- A,
- D,
- chunk_size,
- initial_states,
- seq_idx,
- dt_limit,
- return_final_states,
- activation,
- rmsnorm_weight,
- rmsnorm_eps,
- outproj_weight,
- outproj_bias,
- headdim,
- ngroups,
- norm_before_gate,
- )
-
-
- def mamba_split_conv1d_scan_ref(
- zxbcdt,
- conv1d_weight,
- conv1d_bias,
- dt_bias,
- A,
- D,
- chunk_size,
- dt_limit=(0.0, float("inf")),
- activation="silu",
- rmsnorm_weight=None,
- rmsnorm_eps=1e-6,
- outproj_weight=None,
- outproj_bias=None,
- headdim=None,
- ngroups=1,
- norm_before_gate=True,
- ):
- """
- Argument:
- zxbcdt: (batch, seqlen, 2 * dim + 2 * ngroups * dstate + nheads) where dim == nheads * headdim
- conv1d_weight: (dim + 2 * ngroups * dstate, width)
- conv1d_bias: (dim + 2 * ngroups * dstate,)
- dt_bias: (nheads,)
- A: (nheads)
- D: (nheads, headdim) or (nheads,)
- rmsnorm_weight: (dim,)
- outproj_weight: (out_dim, dim)
- outproj_bias: (out_dim,)
- headdim: if D is 1D, headdim must be passed in
- norm_before_gate: if True, we do RMSNorm(x) * F.silu(z). If False, we do RMSNorm(x * F.silu(z))
- Return:
- out: (batch, seqlen, dim)
- """
- if D.dim() == 1:
- assert headdim is not None
- (nheads,) = D.shape
- else:
- nheads, headdim = D.shape
- assert nheads % ngroups == 0
- batch, seqlen, _ = zxbcdt.shape
- dim = nheads * headdim
- dstate = (zxbcdt.shape[-1] - 2 * dim - nheads) // ngroups // 2
- assert zxbcdt.shape == (batch, seqlen, 2 * dim + 2 * ngroups * dstate + nheads)
- assert dt_bias.shape == (nheads,)
- assert A.shape == (nheads,)
- if rmsnorm_weight is not None:
- assert rmsnorm_weight.shape == (dim,)
- z, xBC, dt = torch.split(zxbcdt, [dim, dim + 2 * ngroups * dstate, nheads], dim=-1)
- xBC = rearrange(
- causal_conv1d_fn(
- rearrange(xBC, "b s d -> b d s"),
- conv1d_weight,
- conv1d_bias,
- activation=activation,
- ),
- "b d s -> b s d",
- )
- x, B, C = torch.split(xBC, [dim, ngroups * dstate, ngroups * dstate], dim=-1)
- x = rearrange(x, "b l (h p) -> b l h p", h=nheads)
- B = rearrange(B, "b l (g n) -> b l g n", g=ngroups)
- C = rearrange(C, "b l (g n) -> b l g n", g=ngroups)
- z = rearrange(z, "b l (h p) -> b l h p", h=nheads)
- out = ssd_selective_scan(
- x,
- dt.to(x.dtype),
- A,
- B,
- C,
- D=D.float(),
- z=z if rmsnorm_weight is None else None,
- dt_bias=dt_bias,
- dt_softplus=True,
- dt_limit=dt_limit,
- )
- out = rearrange(out, "b s h p -> b s (h p)")
- if rmsnorm_weight is not None:
- out = rmsnorm_fn(
- out,
- rmsnorm_weight,
- None,
- z=rearrange(z, "b l h p -> b l (h p)"),
- eps=rmsnorm_eps,
- norm_before_gate=norm_before_gate,
- )
- if outproj_weight is not None:
- out = F.linear(out, outproj_weight, outproj_bias)
- return out
 
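The hunk above removes `mamba_split_conv1d_scan_combined` and its reference implementation from this build directory. For readers following the deleted API, here is a minimal, hedged shape sketch (not part of the repository) of how the fused entry point is called according to its docstring; all sizes are illustrative, and a CUDA-enabled install of `mamba_ssm` with its Triton and causal-conv1d kernels is assumed.

```python
# Hedged usage sketch based on the docstring in the hunk above.
# Shapes follow the documented layout; concrete sizes are illustrative only.
import torch
from mamba_ssm.ops.triton.ssd_combined import mamba_split_conv1d_scan_combined

batch, seqlen = 2, 128
nheads, headdim, ngroups, dstate, width = 8, 64, 1, 16, 4
dim = nheads * headdim  # dim == nheads * headdim, per the docstring
device = "cuda"

# zxbcdt packs [z | x, B, C | dt] along the feature dimension.
zxbcdt = torch.randn(batch, seqlen, 2 * dim + 2 * ngroups * dstate + nheads, device=device)
conv1d_weight = torch.randn(dim + 2 * ngroups * dstate, width, device=device)
conv1d_bias = torch.randn(dim + 2 * ngroups * dstate, device=device)
dt_bias = torch.randn(nheads, device=device)
A = -torch.rand(nheads, device=device)   # A is typically negative (-exp(A_log)) in Mamba-2
D = torch.randn(nheads, device=device)   # D is 1D here, so headdim must be passed explicitly

out = mamba_split_conv1d_scan_combined(
    zxbcdt, conv1d_weight, conv1d_bias, dt_bias, A, D,
    chunk_size=64, headdim=headdim, ngroups=ngroups,
)
assert out.shape == (batch, seqlen, dim)
```

With `rmsnorm_weight` and `outproj_weight` left as `None`, the gate `z` is applied inside the scan and the output keeps the `(batch, seqlen, dim)` shape given in the docstring.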
build/torch25-cxx98-cu118-x86_64-linux/mamba_ssm/utils/__init__.py DELETED
File without changes
build/torch25-cxx98-cu121-x86_64-linux/mamba_ssm/__init__.py DELETED
@@ -1,14 +0,0 @@
- __version__ = "2.2.4"
-
- from .ops.selective_scan_interface import selective_scan_fn, mamba_inner_fn
- from .modules.mamba_simple import Mamba
- from .modules.mamba2 import Mamba2
- from .models.mixer_seq_simple import MambaLMHeadModel
-
- __all__ = [
- "selective_scan_fn",
- "mamba_inner_fn",
- "Mamba",
- "Mamba2",
- "MambaLMHeadModel",
- ]
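For context, the deleted `__init__.py` above re-exports the package's main entry points. A minimal, hedged usage sketch of one of them follows; the layer width and input shape are illustrative, and a CUDA device with the compiled kernels is assumed.

```python
# Hedged sketch of the public API re-exported by the __init__.py shown above.
import torch
from mamba_ssm import Mamba2  # one of the names listed in __all__

layer = Mamba2(d_model=256).to("cuda")        # d_model chosen for illustration
x = torch.randn(2, 128, 256, device="cuda")   # (batch, seqlen, d_model)
y = layer(x)                                  # output has the same shape as x
```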