diff --git a/README.md b/README.md index 4f7f9af39121839bb024750cfabb6c071a3c07f1..b09c625b97e427efca23e21e9646168ff37af838 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,10 @@ --- title: VideoMamba -emoji: 🌖 -colorFrom: gray -colorTo: pink +emoji: 🐍 +colorFrom: blue +colorTo: green sdk: gradio -sdk_version: 4.21.0 +sdk_version: 3.29.0 app_file: app.py pinned: false license: apache-2.0 diff --git a/__pycache__/imagenet_class_index.cpython-310.pyc b/__pycache__/imagenet_class_index.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d64027ffc7eb3f1fa4a0dee54322c7bc9c0d8090 Binary files /dev/null and b/__pycache__/imagenet_class_index.cpython-310.pyc differ diff --git a/__pycache__/kinetics_class_index.cpython-310.pyc b/__pycache__/kinetics_class_index.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1a60bb523534163496896bee5e1a288480abdba6 Binary files /dev/null and b/__pycache__/kinetics_class_index.cpython-310.pyc differ diff --git a/__pycache__/transforms.cpython-310.pyc b/__pycache__/transforms.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ec70fceee33c7c1ed22a17f285e7eff49b10ad2e Binary files /dev/null and b/__pycache__/transforms.cpython-310.pyc differ diff --git a/__pycache__/videomamba_image.cpython-310.pyc b/__pycache__/videomamba_image.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..63fb562f07d2d0db094219e15289d80e2f44919d Binary files /dev/null and b/__pycache__/videomamba_image.cpython-310.pyc differ diff --git a/__pycache__/videomamba_video.cpython-310.pyc b/__pycache__/videomamba_video.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4b8abea20d021b0cb8f4f7bd195f99a4bf421e55 Binary files /dev/null and b/__pycache__/videomamba_video.cpython-310.pyc differ diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..4dde9587a53c230d455141a7e85e42ae260ba68a --- /dev/null +++ b/app.py @@ -0,0 +1,180 @@ +import os + +import torch +import torch.nn as nn +import numpy as np +import torch.nn.functional as F +import torchvision.transforms as T +from PIL import Image +from decord import VideoReader +from decord import cpu +from videomamba_image import videomamba_image_tiny +from videomamba_video import videomamba_tiny +from kinetics_class_index import kinetics_classnames +from imagenet_class_index import imagenet_classnames +from transforms import ( + GroupNormalize, GroupScale, GroupCenterCrop, + Stack, ToTorchFormatTensor +) + +import gradio as gr +from huggingface_hub import hf_hub_download + + +# install packages for mamba +os.system("bash install.sh") + + +# Device on which to run the model +# Set to cuda to load on GPU +device = "cuda" +model_video_path = hf_hub_download(repo_id="OpenGVLab/VideoMamba", filename="videomamba_t16_k400_f16_res224.pth") +model_image_path = hf_hub_download(repo_id="OpenGVLab/VideoMamba", filename="videomamba_t16_in1k_res224.pth") +# Pick a pretrained model +model_video = videomamba_tiny(num_classes=400, num_frames=16) +video_sd = torch.load(model_video_path, map_location='cpu') +model_video.load_state_dict(video_sd) +model_image = videomamba_image_tiny() +image_sd = torch.load(model_image_path, map_location='cpu') +model_image.load_state_dict(image_sd['model']) +# Set to eval mode and move to desired device +model_video = model_video.to(device).eval() +model_image = model_image.to(device).eval() + +# Create an id to label name mapping 
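+# (keys are stringified class indices; ImageNet values are [synset_id, label] pairs, hence v[1] below)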
+kinetics_id_to_classname = {} +for k, v in kinetics_classnames.items(): + kinetics_id_to_classname[k] = v +imagenet_id_to_classname = {} +for k, v in imagenet_classnames.items(): + imagenet_id_to_classname[k] = v[1] + + +def get_index(num_frames, num_segments=8): + seg_size = float(num_frames - 1) / num_segments + start = int(seg_size / 2) + offsets = np.array([ + start + int(np.round(seg_size * idx)) for idx in range(num_segments) + ]) + return offsets + + +def load_video(video_path): + vr = VideoReader(video_path, ctx=cpu(0)) + num_frames = len(vr) + frame_indices = get_index(num_frames, 16) + + # transform + crop_size = 160 + scale_size = 160 + input_mean = [0.485, 0.456, 0.406] + input_std = [0.229, 0.224, 0.225] + + transform = T.Compose([ + GroupScale(int(scale_size)), + GroupCenterCrop(crop_size), + Stack(), + ToTorchFormatTensor(), + GroupNormalize(input_mean, input_std) + ]) + + images_group = list() + for frame_index in frame_indices: + img = Image.fromarray(vr[frame_index].asnumpy()) + images_group.append(img) + torch_imgs = transform(images_group) + return torch_imgs + + +def inference_video(video): + vid = load_video(video) + + # The model expects inputs of shape: B x C x H x W + TC, H, W = vid.shape + inputs = vid.reshape(1, TC//3, 3, H, W).permute(0, 2, 1, 3, 4) + + with torch.no_grad(): + prediction = model_video(inputs.to(device)) + prediction = F.softmax(prediction, dim=1).flatten() + + return {kinetics_id_to_classname[str(i)]: float(prediction[i]) for i in range(400)} + + +def set_example_video(example: list) -> dict: + return gr.Video.update(value=example[0]) + + +def inference_image(img): + image = img + image_transform = T.Compose( + [ + T.Resize(224), + T.CenterCrop(224), + T.ToTensor(), + T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ] + ) + image = image_transform(image) + + # The model expects inputs of shape: B x C x H x W + image = image.unsqueeze(0) + + with torch.no_grad(): + prediction = model_image(image.to(device)) + prediction = F.softmax(prediction, dim=1).flatten() + + return {imagenet_id_to_classname[str(i)]: float(prediction[i]) for i in range(1000)} + + +def set_example_image(example: list) -> dict: + return gr.Image.update(value=example[0]) + + +demo = gr.Blocks() +with demo: + gr.Markdown( + """ + # VideoMamba-Ti + Gradio demo for VideoMamba: To use it, simply upload your video, or click one of the examples to load them. Read more at the links below. + """ + ) + + with gr.Tab("Video"): + with gr.Box(): + with gr.Row(): + with gr.Column(): + with gr.Row(): + input_video = gr.Video(label='Input Video').style(height=360) + with gr.Row(): + submit_video_button = gr.Button('Submit') + with gr.Column(): + label_video = gr.Label(num_top_classes=5) + with gr.Row(): + example_videos = gr.Dataset(components=[input_video], samples=[['./videos/hitting_baseball.mp4'], ['./videos/hoverboarding.mp4'], ['./videos/yoga.mp4']]) + + with gr.Tab("Image"): + with gr.Box(): + with gr.Row(): + with gr.Column(): + with gr.Row(): + input_image = gr.Image(label='Input Image', type='pil').style(height=360) + with gr.Row(): + submit_image_button = gr.Button('Submit') + with gr.Column(): + label_image = gr.Label(num_top_classes=5) + with gr.Row(): + example_images = gr.Dataset(components=[input_image], samples=[['./images/cat.png'], ['./images/dog.png'], ['./images/panda.png']]) + + gr.Markdown( + """ +

+        VideoMamba: State Space Model for Efficient Video Understanding | Github Repo

+ """ + ) + + submit_video_button.click(fn=inference_video, inputs=input_video, outputs=label_video) + example_videos.click(fn=set_example_video, inputs=example_videos, outputs=example_videos.components) + submit_image_button.click(fn=inference_image, inputs=input_image, outputs=label_image) + example_images.click(fn=set_example_image, inputs=example_images, outputs=example_images.components) + +demo.launch(enable_queue=True) +# demo.launch(server_name="0.0.0.0", server_port=10034, enable_queue=True) \ No newline at end of file diff --git a/causal-conv1d/AUTHORS b/causal-conv1d/AUTHORS new file mode 100644 index 0000000000000000000000000000000000000000..88193855314bb723ced1860384e417954f559700 --- /dev/null +++ b/causal-conv1d/AUTHORS @@ -0,0 +1 @@ +Tri Dao, tri@tridao.me diff --git a/causal-conv1d/LICENSE b/causal-conv1d/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..5860e4b33f3d9d85fc636137c559331d51783a5b --- /dev/null +++ b/causal-conv1d/LICENSE @@ -0,0 +1,29 @@ +BSD 3-Clause License + +Copyright (c) 2022, the respective contributors, as shown by the AUTHORS file. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/causal-conv1d/README.md b/causal-conv1d/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4e905425a650d77c5c4854e4c4a261778c4d2690 --- /dev/null +++ b/causal-conv1d/README.md @@ -0,0 +1 @@ +# Causal depthwise conv1d in CUDA with a PyTorch interface diff --git a/causal-conv1d/causal_conv1d/__init__.py b/causal-conv1d/causal_conv1d/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..cc4d610a1e557cabd723fb6e33438f03c5c4bf66 --- /dev/null +++ b/causal-conv1d/causal_conv1d/__init__.py @@ -0,0 +1,3 @@ +__version__ = "1.0.0" + +from causal_conv1d.causal_conv1d_interface import causal_conv1d_fn, causal_conv1d_update diff --git a/causal-conv1d/causal_conv1d/causal_conv1d_interface.py b/causal-conv1d/causal_conv1d/causal_conv1d_interface.py new file mode 100644 index 0000000000000000000000000000000000000000..f66143c39e767572ca12112811a384239b8beb63 --- /dev/null +++ b/causal-conv1d/causal_conv1d/causal_conv1d_interface.py @@ -0,0 +1,104 @@ +# Copyright (c) 2023, Tri Dao. + +import torch +import torch.nn.functional as F + + +import causal_conv1d_cuda + + +class CausalConv1dFn(torch.autograd.Function): + @staticmethod + def forward(ctx, x, weight, bias=None, activation=None): + if activation not in [None, "silu", "swish"]: + raise NotImplementedError("activation must be None, silu, or swish") + if x.stride(2) != 1 and x.stride(1) != 1: + x = x.contiguous() + bias = bias.contiguous() if bias is not None else None + ctx.save_for_backward(x, weight, bias) + ctx.activation = activation in ["silu", "swish"] + out = causal_conv1d_cuda.causal_conv1d_fwd(x, weight, bias, ctx.activation) + return out + + @staticmethod + def backward(ctx, dout): + x, weight, bias = ctx.saved_tensors + if dout.stride(2) != 1 and dout.stride(1) != 1: + dout = dout.contiguous() + # The kernel supports passing in a pre-allocated dx (e.g., in case we want to fuse the + # backward of conv1d with the backward of chunk). + # Here we just pass in None and dx will be allocated in the C++ code. 
+ dx, dweight, dbias = causal_conv1d_cuda.causal_conv1d_bwd( + x, weight, bias, dout, None, ctx.activation + ) + return dx, dweight, dbias if bias is not None else None, None + + +def causal_conv1d_fn(x, weight, bias=None, activation=None): + """ + x: (batch, dim, seqlen) + weight: (dim, width) + bias: (dim,) + activation: either None or "silu" or "swish" + + out: (batch, dim, seqlen) + """ + return CausalConv1dFn.apply(x, weight, bias, activation) + + +def causal_conv1d_ref(x, weight, bias=None, activation=None): + """ + x: (batch, dim, seqlen) + weight: (dim, width) + bias: (dim,) + + out: (batch, dim, seqlen) + """ + if activation not in [None, "silu", "swish"]: + raise NotImplementedError("activation must be None, silu, or swish") + dtype_in = x.dtype + x = x.to(weight.dtype) + seqlen = x.shape[-1] + dim, width = weight.shape + out = F.conv1d(x, weight.unsqueeze(1), bias, padding=width - 1, groups=dim) + out = out[..., :seqlen] + return (out if activation is None else F.silu(out)).to(dtype=dtype_in) + + +def causal_conv1d_update(x, conv_state, weight, bias=None, activation=None): + """ + x: (batch, dim) + conv_state: (batch, dim, width) + weight: (dim, width) + bias: (dim,) + + out: (batch, dim) + """ + if activation not in [None, "silu", "swish"]: + raise NotImplementedError("activation must be None, silu, or swish") + activation = activation in ["silu", "swish"] + return causal_conv1d_cuda.causal_conv1d_update(x, conv_state, weight, bias, activation) + + +def causal_conv1d_update_ref(x, conv_state, weight, bias=None, activation=None): + """ + x: (batch, dim) + conv_state: (batch, dim, width) + weight: (dim, width) + bias: (dim,) + + out: (batch, dim) + """ + if activation not in [None, "silu", "swish"]: + raise NotImplementedError("activation must be None, silu, or swish") + dtype_in = x.dtype + batch, dim = x.shape + width = weight.shape[1] + assert conv_state.shape == (batch, dim, width) + assert weight.shape == (dim, width) + conv_state.copy_(torch.roll(conv_state, shifts=-1, dims=-1)) # Update state (B D W) + conv_state[:, :, -1] = x + out = torch.sum(conv_state * weight, dim=-1) # (B D) + if bias is not None: + out += bias + return (out if activation is None else F.silu(out)).to(dtype=dtype_in) diff --git a/causal-conv1d/csrc/causal_conv1d.cpp b/causal-conv1d/csrc/causal_conv1d.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1c80516ac8599d4d80910a1d4d85c4c435cf1e4f --- /dev/null +++ b/causal-conv1d/csrc/causal_conv1d.cpp @@ -0,0 +1,333 @@ +/****************************************************************************** + * Copyright (c) 2023, Tri Dao. + ******************************************************************************/ + +#include +#include +#include +#include + +#include "causal_conv1d.h" + +#define CHECK_SHAPE(x, ...) TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), #x " must have shape (" #__VA_ARGS__ ")") + +#define DISPATCH_ITYPE_FLOAT_AND_HALF_AND_BF16(ITYPE, NAME, ...) \ + if (ITYPE == at::ScalarType::Half) { \ + using input_t = at::Half; \ + __VA_ARGS__(); \ + } else if (ITYPE == at::ScalarType::BFloat16) { \ + using input_t = at::BFloat16; \ + __VA_ARGS__(); \ + } else if (ITYPE == at::ScalarType::Float) { \ + using input_t = float; \ + __VA_ARGS__(); \ + } else { \ + AT_ERROR(#NAME, " not implemented for input type '", toString(ITYPE), "'"); \ + } + +#define DISPATCH_WTYPE_FLOAT_AND_HALF_AND_BF16(WTYPE, NAME, ...) 
\ + if (WTYPE == at::ScalarType::Half) { \ + using weight_t = at::Half; \ + __VA_ARGS__(); \ + } else if (WTYPE == at::ScalarType::BFloat16) { \ + using weight_t = at::BFloat16; \ + __VA_ARGS__(); \ + } else if (WTYPE == at::ScalarType::Float) { \ + using weight_t = float; \ + __VA_ARGS__(); \ + } else { \ + AT_ERROR(#NAME, " not implemented for weight type '", toString(WTYPE), "'"); \ + } + +template +void causal_conv1d_fwd_cuda(ConvParamsBase ¶ms, cudaStream_t stream); +template +void causal_conv1d_channellast_fwd_cuda(ConvParamsBase ¶ms, cudaStream_t stream); + +template +void causal_conv1d_bwd_cuda(ConvParamsBwd ¶ms, cudaStream_t stream); +template +void causal_conv1d_channellast_bwd_cuda(ConvParamsBwd ¶ms, cudaStream_t stream); + +template +void causal_conv1d_update_cuda(ConvParamsBase ¶ms, cudaStream_t stream); + +void set_conv_params_fwd(ConvParamsBase ¶ms, + // sizes + const size_t batch, + const size_t dim, + const size_t seqlen, + const size_t width, + // device pointers + const at::Tensor x, + const at::Tensor weight, + const at::Tensor out, + void* bias_ptr, + bool silu_activation) { + + // Reset the parameters + memset(¶ms, 0, sizeof(params)); + + params.batch = batch; + params.dim = dim; + params.seqlen = seqlen; + params.width = width; + + params.silu_activation = silu_activation; + + // Set the pointers and strides. + params.x_ptr = x.data_ptr(); + params.weight_ptr = weight.data_ptr(); + params.bias_ptr = bias_ptr; + params.out_ptr = out.data_ptr(); + // All stride are in elements, not bytes. + params.x_batch_stride = x.stride(0); + params.x_c_stride = x.stride(1); + params.x_l_stride = x.stride(-1); + params.weight_c_stride = weight.stride(0); + params.weight_width_stride = weight.stride(1); + params.out_batch_stride = out.stride(0); + params.out_c_stride = out.stride(1); + params.out_l_stride = out.stride(-1); +} + + +void set_conv_params_bwd(ConvParamsBwd ¶ms, + // sizes + const size_t batch, + const size_t dim, + const size_t seqlen, + const size_t width, + // device pointers + const at::Tensor x, + const at::Tensor weight, + void* bias_ptr, + const at::Tensor dout, + const at::Tensor dx, + const at::Tensor dweight, + void* dbias_ptr, + bool silu_activation) { + // Pass in "dout" instead of "out", we're not gonna use "out" at all. + set_conv_params_fwd(params, batch, dim, seqlen, width, + x, weight, dout, bias_ptr, silu_activation); + + // Set the pointers and strides. + params.dout_ptr = dout.data_ptr(); + params.dx_ptr = dx.data_ptr(); + params.dweight_ptr = dweight.data_ptr(); + params.dbias_ptr = dbias_ptr; + // All stride are in elements, not bytes. 
+ params.dout_batch_stride = dout.stride(0); + params.dout_c_stride = dout.stride(1); + params.dout_l_stride = dout.stride(2); + params.dweight_c_stride = dweight.stride(0); + params.dweight_width_stride = dweight.stride(1); + params.dx_batch_stride = dx.stride(0); + params.dx_c_stride = dx.stride(1); + params.dx_l_stride = dx.stride(2); +} + +at::Tensor +causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight, + const c10::optional &bias_, + bool silu_activation) { + auto input_type = x.scalar_type(); + auto weight_type = weight.scalar_type(); + TORCH_CHECK(input_type == at::ScalarType::Float || input_type == at::ScalarType::Half || input_type == at::ScalarType::BFloat16); + TORCH_CHECK(weight_type == at::ScalarType::Float || weight_type == at::ScalarType::Half || weight_type == at::ScalarType::BFloat16); + + TORCH_CHECK(x.is_cuda()); + TORCH_CHECK(weight.is_cuda()); + + const auto sizes = x.sizes(); + const int batch_size = sizes[0]; + const int dim = sizes[1]; + const int seqlen = sizes[2]; + const int width = weight.size(-1); + + CHECK_SHAPE(x, batch_size, dim, seqlen); + CHECK_SHAPE(weight, dim, width); + + TORCH_CHECK(x.stride(2) == 1 || x.stride(1) == 1); + const bool is_channel_last = x.stride(1) == 1 && x.stride(2) > 1; + + if (is_channel_last) { + TORCH_CHECK(dim % 8 == 0, "causal_conv1d only supports channel dimension divisible by 8 for now"); + } + TORCH_CHECK(width >= 2 && width <= 4, "causal_conv1d only supports width between 2 and 4"); + + + if (bias_.has_value()) { + auto bias = bias_.value(); + TORCH_CHECK(bias.scalar_type() == weight_type); + TORCH_CHECK(bias.is_cuda()); + TORCH_CHECK(bias.stride(-1) == 1); + CHECK_SHAPE(bias, dim); + } + + at::Tensor out = torch::empty_like(x); + + ConvParamsBase params; + set_conv_params_fwd(params, batch_size, dim, seqlen, width, x, weight, out, + bias_.has_value() ? 
bias_.value().data_ptr() : nullptr, + silu_activation); + + // Otherwise the kernel will be launched from cuda:0 device + // Cast to char to avoid compiler warning about narrowing + at::cuda::CUDAGuard device_guard{(char)x.get_device()}; + auto stream = at::cuda::getCurrentCUDAStream().stream(); + DISPATCH_ITYPE_FLOAT_AND_HALF_AND_BF16(x.scalar_type(), "causal_conv1d_fwd", [&] { + DISPATCH_WTYPE_FLOAT_AND_HALF_AND_BF16(weight.scalar_type(), "causal_conv1d_fwd", [&] { + if (!is_channel_last) { + causal_conv1d_fwd_cuda(params, stream); + } else { + causal_conv1d_channellast_fwd_cuda(params, stream); + } + }); + }); + return out; +} + +std::vector +causal_conv1d_bwd(const at::Tensor &x, const at::Tensor &weight, + const c10::optional &bias_, + at::Tensor &dout, + c10::optional &dx_, + bool silu_activation) { + auto input_type = x.scalar_type(); + auto weight_type = weight.scalar_type(); + TORCH_CHECK(input_type == at::ScalarType::Float || input_type == at::ScalarType::Half || input_type == at::ScalarType::BFloat16); + TORCH_CHECK(weight_type == at::ScalarType::Float || weight_type == at::ScalarType::Half || weight_type == at::ScalarType::BFloat16); + + TORCH_CHECK(x.is_cuda()); + TORCH_CHECK(weight.is_cuda()); + TORCH_CHECK(dout.is_cuda()); + + const auto sizes = x.sizes(); + const int batch_size = sizes[0]; + const int dim = sizes[1]; + const int seqlen = sizes[2]; + const int width = weight.size(-1); + + TORCH_CHECK(width >= 2 && width <= 4, "causal_conv1d only supports width between 2 and 4"); + + CHECK_SHAPE(x, batch_size, dim, seqlen); + CHECK_SHAPE(weight, dim, width); + CHECK_SHAPE(dout, batch_size, dim, seqlen); + + TORCH_CHECK(x.stride(2) == 1 || x.stride(1) == 1); + const bool is_channel_last = x.stride(1) == 1 && x.stride(2) > 1; + if (!is_channel_last && dout.stride(2) != 1) { dout = dout.contiguous(); } + if (is_channel_last && dout.stride(1) != 1) { dout = dout.transpose(-1, -2).contiguous().transpose(-1, -2); } + + if (bias_.has_value()) { + auto bias = bias_.value(); + TORCH_CHECK(bias.scalar_type() == weight_type); + TORCH_CHECK(bias.is_cuda()); + TORCH_CHECK(bias.stride(-1) == 1); + CHECK_SHAPE(bias, dim); + } + + at::Tensor dx; + if (dx_.has_value()) { + dx = dx_.value(); + TORCH_CHECK(dx.scalar_type() == input_type); + TORCH_CHECK(dx.is_cuda()); + CHECK_SHAPE(dx, batch_size, dim, seqlen); + if (!is_channel_last) { TORCH_CHECK(dx.stride(2) == 1); } + if (is_channel_last) { TORCH_CHECK(dx.stride(1) == 1); } + } else { + dx = torch::empty_like(x); + } + + // Otherwise the kernel will be launched from cuda:0 device + // Cast to char to avoid compiler warning about narrowing + at::cuda::CUDAGuard device_guard{(char)x.get_device()}; + + at::Tensor dweight = torch::zeros_like(weight, weight.options().dtype(at::kFloat)); + at::Tensor dbias; + if (bias_.has_value()) { dbias = torch::zeros_like(bias_.value(), bias_.value().options().dtype(at::kFloat)); } + + ConvParamsBwd params; + set_conv_params_bwd(params, batch_size, dim, seqlen, width, + x, weight, bias_.has_value() ? bias_.value().data_ptr() : nullptr, + dout, dx, dweight, bias_.has_value() ? 
dbias.data_ptr() : nullptr, + silu_activation); + + auto stream = at::cuda::getCurrentCUDAStream().stream(); + DISPATCH_ITYPE_FLOAT_AND_HALF_AND_BF16(x.scalar_type(), "causal_conv1d_bwd", [&] { + DISPATCH_WTYPE_FLOAT_AND_HALF_AND_BF16(weight.scalar_type(), "causal_conv1d_bwd", [&] { + if (!is_channel_last) { + causal_conv1d_bwd_cuda(params, stream); + } else { + causal_conv1d_channellast_bwd_cuda(params, stream); + } + }); + }); + return {dx, dweight.to(weight.dtype()), bias_.has_value() ? dbias.to(bias_.value().dtype()) : dbias}; +} + +at::Tensor +causal_conv1d_update(const at::Tensor &x, + const at::Tensor &conv_state, + const at::Tensor &weight, + const c10::optional &bias_, + bool silu_activation) { + auto input_type = x.scalar_type(); + auto weight_type = weight.scalar_type(); + TORCH_CHECK(input_type == at::ScalarType::Float || input_type == at::ScalarType::Half || input_type == at::ScalarType::BFloat16); + TORCH_CHECK(weight_type == at::ScalarType::Float || weight_type == at::ScalarType::Half || weight_type == at::ScalarType::BFloat16); + TORCH_CHECK(conv_state.scalar_type() == input_type); + + TORCH_CHECK(x.is_cuda()); + TORCH_CHECK(conv_state.is_cuda()); + TORCH_CHECK(weight.is_cuda()); + + const auto sizes = x.sizes(); + const int batch_size = sizes[0]; + const int dim = sizes[1]; + const int width = weight.size(-1); + + CHECK_SHAPE(x, batch_size, dim); + CHECK_SHAPE(conv_state, batch_size, dim, width); + CHECK_SHAPE(weight, dim, width); + + TORCH_CHECK(width >= 2 && width <= 4, "causal_conv1d only supports width between 2 and 4"); + + if (bias_.has_value()) { + auto bias = bias_.value(); + TORCH_CHECK(bias.scalar_type() == weight_type); + TORCH_CHECK(bias.is_cuda()); + TORCH_CHECK(bias.stride(-1) == 1); + CHECK_SHAPE(bias, dim); + } + + at::Tensor out = torch::empty_like(x); + + ConvParamsBase params; + set_conv_params_fwd(params, batch_size, dim, /*seqlen=*/1, width, x, weight, out, + bias_.has_value() ? bias_.value().data_ptr() : nullptr, + silu_activation); + params.conv_state_ptr = conv_state.data_ptr(); + // All stride are in elements, not bytes. + params.conv_state_batch_stride = conv_state.stride(0); + params.conv_state_c_stride = conv_state.stride(1); + params.conv_state_l_stride = conv_state.stride(2); + + // Otherwise the kernel will be launched from cuda:0 device + // Cast to char to avoid compiler warning about narrowing + at::cuda::CUDAGuard device_guard{(char)x.get_device()}; + auto stream = at::cuda::getCurrentCUDAStream().stream(); + DISPATCH_ITYPE_FLOAT_AND_HALF_AND_BF16(x.scalar_type(), "causal_conv1d_update", [&] { + DISPATCH_WTYPE_FLOAT_AND_HALF_AND_BF16(weight.scalar_type(), "causal_conv1d_update", [&] { + causal_conv1d_update_cuda(params, stream); + }); + }); + return out; +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("causal_conv1d_fwd", &causal_conv1d_fwd, "Causal conv1d forward"); + m.def("causal_conv1d_bwd", &causal_conv1d_bwd, "Causal conv1d backward"); + m.def("causal_conv1d_update", &causal_conv1d_update, "Causal conv1d update"); +} diff --git a/causal-conv1d/csrc/causal_conv1d.h b/causal-conv1d/csrc/causal_conv1d.h new file mode 100644 index 0000000000000000000000000000000000000000..844ed92cfc91a881e58fccfca001a13ebcc434cc --- /dev/null +++ b/causal-conv1d/csrc/causal_conv1d.h @@ -0,0 +1,53 @@ +/****************************************************************************** + * Copyright (c) 2023, Tri Dao. 
+ ******************************************************************************/ + +#pragma once + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct ConvParamsBase { + using index_t = uint32_t; + + int batch, dim, seqlen, width; + bool silu_activation; + + index_t x_batch_stride; + index_t x_c_stride; + index_t x_l_stride; + index_t weight_c_stride; + index_t weight_width_stride; + index_t out_batch_stride; + index_t out_c_stride; + index_t out_l_stride; + + index_t conv_state_batch_stride; + index_t conv_state_c_stride; + index_t conv_state_l_stride; + + // Common data pointers. + void *__restrict__ x_ptr; + void *__restrict__ weight_ptr; + void *__restrict__ bias_ptr; + void *__restrict__ out_ptr; + + void *__restrict__ conv_state_ptr; +}; + +struct ConvParamsBwd: public ConvParamsBase { + index_t dx_batch_stride; + index_t dx_c_stride; + index_t dx_l_stride; + index_t dweight_c_stride; + index_t dweight_width_stride; + index_t dout_batch_stride; + index_t dout_c_stride; + index_t dout_l_stride; + + // Common data pointers. + void *__restrict__ dx_ptr; + void *__restrict__ dweight_ptr; + void *__restrict__ dbias_ptr; + void *__restrict__ dout_ptr; +}; + diff --git a/causal-conv1d/csrc/causal_conv1d_bwd.cu b/causal-conv1d/csrc/causal_conv1d_bwd.cu new file mode 100644 index 0000000000000000000000000000000000000000..66609750a30a86a284451871ca163d79a0529047 --- /dev/null +++ b/causal-conv1d/csrc/causal_conv1d_bwd.cu @@ -0,0 +1,525 @@ +/****************************************************************************** + * Copyright (c) 2023, Tri Dao. + ******************************************************************************/ + +#include +#include +#include // For C10_CUDA_CHECK and C10_CUDA_KERNEL_LAUNCH_CHECK + +#include +#include +#include + +#include "causal_conv1d.h" +#include "causal_conv1d_common.h" +#include "static_switch.h" + +template +struct Causal_conv1d_bwd_kernel_traits { + using input_t = input_t_; + using weight_t = weight_t_; + static constexpr int kNThreads = kNThreads_; + static constexpr int kWidth = kWidth_; + static constexpr bool kSiluAct = kSiluAct_; + static constexpr int kNBytes = sizeof(input_t); + static_assert(kNBytes == 2 || kNBytes == 4); + static constexpr int kNElts = kNBytes == 4 ? 4 : 8; + static_assert(kWidth <= kNElts); + // It's possible that we need to do 2 rounds of exchange if input_t is 16 bits + // (since then we'd have 8 values of float, and each round we can exchange 4 floats). + static constexpr int kNExchangeRounds = sizeof(float) / sizeof(input_t); + static constexpr bool kIsVecLoad = kIsVecLoad_; + using vec_t = typename BytesToType::Type; + using BlockLoadT = cub::BlockLoad; + using BlockLoadVecT = cub::BlockLoad; + using BlockStoreT = cub::BlockStore; + using BlockStoreVecT = cub::BlockStore; + using BlockReduceFloatT = cub::BlockReduce; + static constexpr int kSmemIOSize = kIsVecLoad + ? 0 + : std::max({sizeof(typename BlockLoadT::TempStorage), sizeof(typename BlockStoreT::TempStorage)}); + static constexpr int kSmemExchangeSize = kNThreads * kNBytes * kNElts * (!kSiluAct ? 1 : kNExchangeRounds + 1); + static constexpr int kSmemSize = std::max({kSmemExchangeSize, + int(sizeof(typename BlockReduceFloatT::TempStorage))}) + (kIsVecLoad ? 
0 : kSmemIOSize); +}; + +template +__global__ __launch_bounds__(Ktraits::kNThreads) +void causal_conv1d_bwd_kernel(ConvParamsBwd params) { + constexpr int kWidth = Ktraits::kWidth; + constexpr int kNThreads = Ktraits::kNThreads; + constexpr bool kSiluAct = Ktraits::kSiluAct; + constexpr int kNElts = Ktraits::kNElts; + constexpr int kNExchangeRounds = Ktraits::kNExchangeRounds; + constexpr bool kIsVecLoad = Ktraits::kIsVecLoad; + using input_t = typename Ktraits::input_t; + using vec_t = typename Ktraits::vec_t; + using weight_t = typename Ktraits::weight_t; + + // Shared memory. + extern __shared__ char smem_[]; + auto& smem_load = reinterpret_cast(smem_); + auto& smem_load_vec = reinterpret_cast(smem_); + auto& smem_store = reinterpret_cast(smem_); + auto& smem_store_vec = reinterpret_cast(smem_); + vec_t *smem_exchange = reinterpret_cast(smem_ + Ktraits::kSmemIOSize); + vec_t *smem_exchange_x = reinterpret_cast(smem_ + Ktraits::kSmemIOSize) + kNThreads * kNExchangeRounds; + auto& smem_reduce_float = *reinterpret_cast(smem_ + Ktraits::kSmemIOSize); + + const int tidx = threadIdx.x; + const int batch_id = blockIdx.x; + const int dim_id = blockIdx.y; + input_t *x = reinterpret_cast(params.x_ptr) + batch_id * params.x_batch_stride + + dim_id * params.x_c_stride; + weight_t *weight = reinterpret_cast(params.weight_ptr) + dim_id * params.weight_c_stride; + input_t *dout = reinterpret_cast(params.dout_ptr) + batch_id * params.dout_batch_stride + + dim_id * params.dout_c_stride; + input_t *dx = reinterpret_cast(params.dx_ptr) + batch_id * params.dx_batch_stride + + dim_id * params.dx_c_stride; + float *dweight = reinterpret_cast(params.dweight_ptr) + dim_id * params.dweight_c_stride; + float bias_val = params.bias_ptr == nullptr ? 0.f : float(reinterpret_cast(params.bias_ptr)[dim_id]); + + // Thread kNThreads - 1 will load the first elements of the next chunk so we initialize those to 0. 
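+    // (those exchange slots start as zeros because the rightmost chunk, which is processed first, has no chunk to its right)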
+ if (tidx == 0) { + if constexpr (!kSiluAct) { + input_t zeros[kNElts] = {0}; + smem_exchange[0] = reinterpret_cast(zeros)[0]; + } else { + float zeros[kNElts] = {0}; + #pragma unroll + for (int r = 0; r < kNExchangeRounds; ++r) { + smem_exchange[r * kNThreads] = reinterpret_cast(zeros)[r]; + } + } + } + + float weight_vals[kWidth]; + #pragma unroll + for (int i = 0; i < kWidth; ++i) { weight_vals[i] = weight[i * params.weight_width_stride]; } + + float dweight_vals[kWidth] = {0}; + float dbias_val = 0; + + constexpr int kChunkSize = kNThreads * kNElts; + const int n_chunks = (params.seqlen + kChunkSize - 1) / kChunkSize; + x += (n_chunks - 1) * kChunkSize; + dout += (n_chunks - 1) * kChunkSize; + dx += (n_chunks - 1) * kChunkSize; + for (int chunk = n_chunks - 1; chunk >= 0; --chunk) { + input_t x_vals_load[2 * kNElts] = {0}; + input_t dout_vals_load[2 * kNElts] = {0}; + if constexpr(kIsVecLoad) { + Ktraits::BlockLoadVecT(smem_load_vec).Load(reinterpret_cast(x), *reinterpret_cast(&x_vals_load[kNElts]), (params.seqlen - chunk * kChunkSize) / kNElts); + Ktraits::BlockLoadVecT(smem_load_vec).Load(reinterpret_cast(dout), *reinterpret_cast(&dout_vals_load[0]), (params.seqlen - chunk * kChunkSize) / kNElts); + } else { + __syncthreads(); + Ktraits::BlockLoadT(smem_load).Load(x, *reinterpret_cast(&x_vals_load[kNElts]), params.seqlen - chunk * kChunkSize); + __syncthreads(); + Ktraits::BlockLoadT(smem_load).Load(dout, *reinterpret_cast(&dout_vals_load[0]), params.seqlen - chunk * kChunkSize); + } + float dout_vals[2 * kNElts], x_vals[2 * kNElts]; + if constexpr (!kSiluAct) { + __syncthreads(); + // Thread 0 don't write yet, so that thread kNThreads - 1 can read + // the first elements of the next chunk. + if (tidx > 0) { smem_exchange[tidx] = reinterpret_cast(dout_vals_load)[0]; } + __syncthreads(); + reinterpret_cast(dout_vals_load)[1] = smem_exchange[tidx < kNThreads - 1 ? tidx + 1 : 0]; + __syncthreads(); + // Now thread 0 can write the first elements of the current chunk. + if (tidx == 0) { smem_exchange[tidx] = reinterpret_cast(dout_vals_load)[0]; } + #pragma unroll + for (int i = 0; i < 2 * kNElts; ++i) { + dout_vals[i] = float(dout_vals_load[i]); + x_vals[i] = float(x_vals_load[i]); + } + } else { + if (tidx == 0 && chunk > 0) { + if constexpr(kIsVecLoad) { + reinterpret_cast(x_vals_load)[0] = reinterpret_cast(x)[-1]; + } else { + #pragma unroll + for (int i = 0; i < kNElts; ++i) { + if (chunk * kChunkSize + i < params.seqlen) { x_vals_load[i] = x[-kNElts + i]; } + } + } + } + __syncthreads(); + smem_exchange_x[tidx] = reinterpret_cast(x_vals_load)[1]; + __syncthreads(); + if (tidx > 0) { reinterpret_cast(x_vals_load)[0] = smem_exchange_x[tidx - 1]; } + #pragma unroll + for (int i = 0; i < 2 * kNElts; ++i) { x_vals[i] = float(x_vals_load[i]); } + // Recompute the output + #pragma unroll + for (int i = 0; i < kNElts; ++i) { + float out_val = bias_val; + #pragma unroll + for (int w = 0; w < kWidth; ++w) { + out_val += weight_vals[w] * x_vals[kNElts + i - (kWidth - w - 1)]; + } + float out_sigmoid_val = 1.0f / (1.0f + expf(-out_val)); + dout_vals[i] = float(dout_vals_load[i]) * out_sigmoid_val + * (1.0f + out_val * (1.0f - out_sigmoid_val)); + } + // Exchange the dout_vals. It's possible that we need to do 2 rounds of exchange + // if input_t is 16 bits (since then we'd have 8 values of float) + __syncthreads(); + // Thread 0 don't write yet, so that thread kNThreads - 1 can read + // the first elements of the next chunk. 
+ if (tidx > 0) { + #pragma unroll + for (int r = 0; r < kNExchangeRounds; ++r) { + smem_exchange[r * kNThreads + tidx] = reinterpret_cast(dout_vals)[r]; + } + } + __syncthreads(); + #pragma unroll + for (int r = 0; r < kNExchangeRounds; ++r) { + reinterpret_cast(dout_vals)[kNExchangeRounds + r] + = smem_exchange[r * kNThreads + (tidx < kNThreads - 1 ? tidx + 1 : 0)]; + } + __syncthreads(); + // Now thread 0 can write the first elements of the current chunk. + if (tidx == 0) { + #pragma unroll + for (int r = 0; r < kNExchangeRounds; ++r) { + smem_exchange[r * kNThreads + tidx] = reinterpret_cast(dout_vals)[r]; + } + } + } + dout -= kChunkSize; + x -= kChunkSize; + + #pragma unroll + for (int i = 0; i < kNElts; ++i) { dbias_val += dout_vals[i]; } + + float dx_vals[kNElts] = {0}; + #pragma unroll + for (int i = 0; i < kNElts; ++i) { + #pragma unroll + for (int w = 0; w < kWidth; ++w) { + dx_vals[i] += weight_vals[w] * dout_vals[i + kWidth - w - 1]; + } + } + + input_t dx_vals_store[kNElts]; + #pragma unroll + for (int i = 0; i < kNElts; ++i) { dx_vals_store[i] = dx_vals[i]; } + if constexpr(kIsVecLoad) { + Ktraits::BlockStoreVecT(smem_store_vec).Store(reinterpret_cast(dx), reinterpret_cast(dx_vals_store), (params.seqlen - chunk * kChunkSize) / kNElts); + } else { + Ktraits::BlockStoreT(smem_store).Store(dx, dx_vals_store, params.seqlen - chunk * kChunkSize); + } + dx -= kChunkSize; + + #pragma unroll + for (int w = 0; w < kWidth; ++w) { + #pragma unroll + for (int i = 0; i < kNElts; ++i) { + dweight_vals[w] += x_vals[kNElts + i] * dout_vals[i + kWidth - w - 1]; + } + } + } + + #pragma unroll + for (int w = 0; w < kWidth; ++w) { + __syncthreads(); + dweight_vals[w] = Ktraits::BlockReduceFloatT(smem_reduce_float).Sum(dweight_vals[w]); + if (tidx == 0) { + atomicAdd(&reinterpret_cast(dweight)[w * params.dweight_width_stride], dweight_vals[w]); + } + } + if (params.bias_ptr != nullptr) { + __syncthreads(); + dbias_val = Ktraits::BlockReduceFloatT(smem_reduce_float).Sum(dbias_val); + if (tidx == 0) { + atomicAdd(&reinterpret_cast(params.dbias_ptr)[dim_id], dbias_val); + } + } +} + +template +void causal_conv1d_bwd_launch(ConvParamsBwd ¶ms, cudaStream_t stream) { + static constexpr int kNElts = sizeof(input_t) == 4 ? 4 : 8; + BOOL_SWITCH(params.seqlen % kNElts == 0, kIsVecLoad, [&] { + BOOL_SWITCH(params.silu_activation, kSiluAct, [&] { + using Ktraits = Causal_conv1d_bwd_kernel_traits; + constexpr int kSmemSize = Ktraits::kSmemSize; + dim3 grid(params.batch, params.dim); + auto kernel = &causal_conv1d_bwd_kernel; + if (kSmemSize >= 48 * 1024) { + C10_CUDA_CHECK(cudaFuncSetAttribute( + kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize)); + } + kernel<<>>(params); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); + }); +} + +template +void causal_conv1d_bwd_cuda(ConvParamsBwd ¶ms, cudaStream_t stream) { + if (params.width == 2) { + causal_conv1d_bwd_launch<128, 2, input_t, weight_t>(params, stream); + } else if (params.width == 3) { + causal_conv1d_bwd_launch<128, 3, input_t, weight_t>(params, stream); + } else if (params.width == 4) { + causal_conv1d_bwd_launch<128, 4, input_t, weight_t>(params, stream); + } +} + +template +struct Causal_conv1d_channellast_bwd_kernel_traits { + // The cache line is 128 bytes, and we try to read 16 bytes per thread. + // So we have 8 threads per "row", so 32 or 64 elements in the channel dimension. + // That leaves 4 columns per warp, and so 16 columns per block (assuming each block has 128 + // threads). 
Each each load is 16 x 32|64 elements in the L x C dimensions. + using input_t = input_t_; + using weight_t = weight_t_; + static constexpr bool kSiluAct = kSiluAct_; + static constexpr int kNThreads = kNThreads_; + static_assert(kNThreads % 32 == 0); + static constexpr int kNWarps = kNThreads / 32; + static constexpr int kWidth = kWidth_; + static constexpr int kChunkSizeL = kChunkSizeL_; + static constexpr int kNBytes = sizeof(input_t); + static_assert(kNBytes == 2 || kNBytes == 4); + static constexpr int kNElts = kNBytes == 4 ? 4 : 8; + static constexpr int kNEltsPerRow = 128 / kNBytes; + static constexpr int kNThreadsPerRow = kNEltsPerRow / kNElts; // Always 8 for now + static_assert(kNThreadsPerRow * kNBytes * kNElts == 128); + static constexpr int kNColsPerWarp = 32 / kNThreadsPerRow; // Always 4 for now + static_assert(kNColsPerWarp * kNThreadsPerRow == 32); + static constexpr int kNColsPerLoad = kNColsPerWarp * kNWarps; + static constexpr int kNLoads = kChunkSizeL / kNColsPerLoad; + static_assert(kNLoads * kNColsPerLoad == kChunkSizeL); + static constexpr bool kIsVecLoad = kIsVecLoad_; + using vec_t = typename BytesToType::Type; + // using BlockLoadT = cub::BlockLoad; + // using BlockStoreT = cub::BlockStore; + // static constexpr int kSmemSize = std::max({sizeof(typename BlockLoadT::TempStorage), + // sizeof(typename BlockStoreT::TempStorage)}); + // static constexpr int kSmemSize = kChunkSizeL * kNEltsPerRow * kNBytes; +}; + +template +__global__ __launch_bounds__(Ktraits::kNThreads) +void causal_conv1d_channellast_bwd_kernel(ConvParamsBwd params) { + constexpr int kWidth = Ktraits::kWidth; + constexpr int kNThreads = Ktraits::kNThreads; + constexpr bool kSiluAct = Ktraits::kSiluAct; + constexpr int kNElts = Ktraits::kNElts; + constexpr int kNWarp = Ktraits::kNWarps; + constexpr int kNThreadsPerC = Ktraits::kNThreadsPerRow; + constexpr int kLPerLoad = Ktraits::kNColsPerLoad; + constexpr int kChunkSizeL = Ktraits::kChunkSizeL; + constexpr int kChunkSizeC = Ktraits::kNEltsPerRow; + using input_t = typename Ktraits::input_t; + using vec_t = typename Ktraits::vec_t; + using weight_t = typename Ktraits::weight_t; + + // Shared memory. 
+ __shared__ input_t dout_smem[kChunkSizeL + kWidth - 1][kChunkSizeC + kNElts]; + __shared__ input_t x_smem[kWidth - 1 + kChunkSizeL + kWidth - 1][kChunkSizeC + kNElts]; + + const int tid = threadIdx.x; + const int l_idx = tid / kNThreadsPerC; + const int c_idx = tid % kNThreadsPerC; + const int batch_id = blockIdx.x; + const int chunk_l_id = blockIdx.y; + const int chunk_c_id = blockIdx.z; + input_t *x = reinterpret_cast(params.x_ptr) + batch_id * params.x_batch_stride + + (chunk_l_id * kChunkSizeL + l_idx) * params.x_l_stride + chunk_c_id * kChunkSizeC + c_idx * kNElts; + weight_t *weight = reinterpret_cast(params.weight_ptr) + + chunk_c_id * kChunkSizeC * params.weight_c_stride; + input_t *dout = reinterpret_cast(params.dout_ptr) + batch_id * params.dout_batch_stride + + (chunk_l_id * kChunkSizeL + l_idx) * params.dout_l_stride + chunk_c_id * kChunkSizeC + c_idx * kNElts; + input_t *dx = reinterpret_cast(params.dx_ptr) + batch_id * params.dx_batch_stride + + (chunk_l_id * kChunkSizeL + l_idx) * params.dx_l_stride + chunk_c_id * kChunkSizeC + c_idx * kNElts; + float *dweight = reinterpret_cast(params.dweight_ptr) + + chunk_c_id * kChunkSizeC * params.dweight_c_stride; + + #pragma unroll + for (int l = 0; l < Ktraits::kNLoads; ++l) { + input_t dout_vals_load[kNElts] = {0}; + input_t x_vals_load[kNElts] = {0}; + if (chunk_l_id * kChunkSizeL + l * kLPerLoad + l_idx < params.seqlen + && chunk_c_id * kChunkSizeC + c_idx * kNElts < params.dim) { + reinterpret_cast(dout_vals_load)[0] = *reinterpret_cast(dout + l * kLPerLoad * params.dout_l_stride); + reinterpret_cast(x_vals_load)[0] = *reinterpret_cast(x + l * kLPerLoad * params.x_l_stride); + } + reinterpret_cast(dout_smem[l * kLPerLoad + l_idx])[c_idx] = reinterpret_cast(dout_vals_load)[0]; + reinterpret_cast(x_smem[kWidth - 1 + l * kLPerLoad + l_idx])[c_idx] = reinterpret_cast(x_vals_load)[0]; + } + // Load the elements from the previous chunk or next chunk that are needed for convolution. 
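+    // (dout is fetched from the chunk to the right, since dx[t] depends on future dout values; x is fetched from the chunk to the left to rebuild the convolution window)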
+ if (l_idx < kWidth - 1) { + input_t dout_vals_load[kNElts] = {0}; + input_t x_vals_load[kNElts] = {0}; + if ((chunk_l_id + 1) * kChunkSizeL + l_idx < params.seqlen + && chunk_c_id * kChunkSizeC + c_idx * kNElts < params.dim) { + reinterpret_cast(dout_vals_load)[0] = *reinterpret_cast(dout + kChunkSizeL * params.dout_l_stride); + } + if (chunk_l_id * kChunkSizeL + l_idx - (kWidth - 1) >= 0 + && chunk_l_id * kChunkSizeL + l_idx - (kWidth - 1) < params.seqlen + && chunk_c_id * kChunkSizeC + c_idx * kNElts < params.dim) { + reinterpret_cast(x_vals_load)[0] = *reinterpret_cast(x - (kWidth - 1) * params.x_l_stride); + } + reinterpret_cast(dout_smem[kChunkSizeL + l_idx])[c_idx] = reinterpret_cast(dout_vals_load)[0]; + reinterpret_cast(x_smem[l_idx])[c_idx] = reinterpret_cast(x_vals_load)[0]; + } + // Need to load (kWdith - 1) extra x's on the right to recompute the (kChunkSizeL + kWidth - 1) outputs + if constexpr (kSiluAct) { + if (l_idx < kWidth - 1) { + input_t x_vals_load[kNElts] = {0}; + if ((chunk_l_id + 1) * kChunkSizeL + l_idx < params.seqlen + && chunk_c_id * kChunkSizeC + c_idx * kNElts < params.dim) { + reinterpret_cast(x_vals_load)[0] = *reinterpret_cast(x + kChunkSizeL * params.x_l_stride); + } + reinterpret_cast(x_smem[kWidth - 1 + kChunkSizeL + l_idx])[c_idx] = reinterpret_cast(x_vals_load)[0]; + } + } + + __syncthreads(); + + constexpr int kLPerThread = std::min(kChunkSizeL * kChunkSizeC / kNThreads, kChunkSizeL); + static_assert(kLPerThread * kNThreads == kChunkSizeL * kChunkSizeC); + constexpr int kNThreadsPerRow = kChunkSizeL / kLPerThread; + static_assert(kNThreadsPerRow * kLPerThread == kChunkSizeL); + // kChunkSizeL, kLPerThread, kNThreadsPerRow should be powers of 2 for simplicity + static_assert((kChunkSizeL & (kChunkSizeL - 1)) == 0); + static_assert((kLPerThread & (kLPerThread - 1)) == 0); + static_assert((kNThreadsPerRow & (kNThreadsPerRow - 1)) == 0); + static_assert(kNThreadsPerRow <= 32); + + const int row_idx = tid / kNThreadsPerRow; + const int col_idx = tid % kNThreadsPerRow; + + float bias_val = params.bias_ptr == nullptr || chunk_c_id * kChunkSizeC + row_idx >= params.dim ? 
0.f : float(reinterpret_cast(params.bias_ptr)[chunk_c_id * kChunkSizeC + row_idx]); + float weight_vals[kWidth] = {0}; + if (chunk_c_id * kChunkSizeC + row_idx < params.dim) { + #pragma unroll + for (int w = 0; w < kWidth; ++w) { + weight_vals[w] = weight[row_idx * params.weight_c_stride + w * params.weight_width_stride]; + } + } + float dout_vals[kLPerThread + kWidth - 1]; + float x_vals[kWidth - 1 + kLPerThread + kWidth - 1]; + #pragma unroll + for (int i = 0; i < kWidth - 1 + kLPerThread; ++i) { + dout_vals[i] = float(dout_smem[col_idx * kLPerThread + i][row_idx]); + x_vals[i] = float(x_smem[col_idx * kLPerThread + i][row_idx]); + } + + if constexpr (kSiluAct) { // Recompute the output + #pragma unroll + for (int i = kWidth - 1 + kLPerThread; i < kWidth - 1 + kLPerThread + kWidth - 1; ++i) { + x_vals[i] = float(x_smem[col_idx * kLPerThread + i][row_idx]); + } + #pragma unroll + for (int i = 0; i < kLPerThread + kWidth - 1; ++i) { + float out_val = bias_val; + #pragma unroll + for (int w = 0; w < kWidth; ++w) { out_val += weight_vals[w] * x_vals[i + w]; } + float out_val_sigmoid = 1.f / (1.f + expf(-out_val)); + dout_vals[i] *= out_val_sigmoid * (1 + out_val * (1 - out_val_sigmoid)); + } + } + + float dweight_vals[kWidth] = {0}; + SumOp sum_op; + #pragma unroll + for (int w = 0; w < kWidth; ++w) { + #pragma unroll + for (int i = 0; i < kLPerThread; ++i) { dweight_vals[w] += x_vals[i + w] * dout_vals[i]; } + dweight_vals[w] = Allreduce::run(dweight_vals[w], sum_op); + if (col_idx == 0 && chunk_c_id * kChunkSizeC + row_idx < params.dim) { + atomicAdd(&reinterpret_cast(dweight)[row_idx * params.dweight_c_stride + w * params.dweight_width_stride], dweight_vals[w]); + } + } + + if (params.bias_ptr != nullptr) { + float dbias_val = 0.f; + for (int i = 0; i < kLPerThread; ++i) { dbias_val += dout_vals[i]; } + dbias_val = Allreduce::run(dbias_val, sum_op); + if (col_idx == 0 && chunk_c_id * kChunkSizeC + row_idx < params.dim) { + atomicAdd(&reinterpret_cast(params.dbias_ptr)[chunk_c_id * kChunkSizeC + row_idx], dbias_val); + } + } + + float dx_vals[kLPerThread] = {0}; + #pragma unroll + for (int i = 0; i < kLPerThread; ++i) { + #pragma unroll + for (int w = 0; w < kWidth; ++w) { dx_vals[i] += weight_vals[kWidth - 1 - w] * dout_vals[i + w]; } + } + // Since kNThreadsPerRow is a power of 2 and <= 32, we only need syncwarp and not syncthreads. 
+ __syncwarp(); + #pragma unroll + for (int i = 0; i < kLPerThread; ++i) { x_smem[col_idx * kLPerThread + i][row_idx] = dx_vals[i]; } + __syncthreads(); + + #pragma unroll + for (int l = 0; l < Ktraits::kNLoads; ++l) { + input_t dx_vals_store[kNElts]; + reinterpret_cast(dx_vals_store)[0] = reinterpret_cast(x_smem[l * kLPerLoad + l_idx])[c_idx]; + if (chunk_l_id * kChunkSizeL + l * kLPerLoad + l_idx < params.seqlen + && chunk_c_id * kChunkSizeC + c_idx * kNElts < params.dim) { + *reinterpret_cast(dx + l * kLPerLoad * params.dx_l_stride) = reinterpret_cast(dx_vals_store)[0]; + } + } + +} + +template +void causal_conv1d_channellast_bwd_launch(ConvParamsBwd ¶ms, cudaStream_t stream) { + BOOL_SWITCH(params.silu_activation, kSiluAct, [&] { + using Ktraits = Causal_conv1d_channellast_bwd_kernel_traits; + // constexpr int kSmemSize = Ktraits::kSmemSize; + constexpr int kChunkSizeL = Ktraits::kChunkSizeL; + constexpr int kChunkSizeC = Ktraits::kNEltsPerRow; + const int n_chunks_L = (params.seqlen + kChunkSizeL - 1) / kChunkSizeL; + const int n_chunks_C = (params.dim + kChunkSizeC - 1) / kChunkSizeC; + dim3 grid(params.batch, n_chunks_L, n_chunks_C); + dim3 block(Ktraits::kNThreads); + auto kernel = &causal_conv1d_channellast_bwd_kernel; + // if (kSmemSize >= 48 * 1024) { + // C10_CUDA_CHECK(cudaFuncSetAttribute( + // kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize)); + // } + // kernel<<>>(params); + kernel<<>>(params); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); +} + +template +void causal_conv1d_channellast_bwd_cuda(ConvParamsBwd ¶ms, cudaStream_t stream) { + if (params.width == 2) { + causal_conv1d_channellast_bwd_launch<128, 2, input_t, weight_t>(params, stream); + } else if (params.width == 3) { + causal_conv1d_channellast_bwd_launch<128, 3, input_t, weight_t>(params, stream); + } else if (params.width == 4) { + causal_conv1d_channellast_bwd_launch<128, 4, input_t, weight_t>(params, stream); + } +} + +template void causal_conv1d_bwd_cuda(ConvParamsBwd ¶ms, cudaStream_t stream); +template void causal_conv1d_bwd_cuda(ConvParamsBwd ¶ms, cudaStream_t stream); +template void causal_conv1d_bwd_cuda(ConvParamsBwd ¶ms, cudaStream_t stream); +template void causal_conv1d_bwd_cuda(ConvParamsBwd ¶ms, cudaStream_t stream); +template void causal_conv1d_bwd_cuda(ConvParamsBwd ¶ms, cudaStream_t stream); +template void causal_conv1d_bwd_cuda(ConvParamsBwd ¶ms, cudaStream_t stream); +template void causal_conv1d_bwd_cuda(ConvParamsBwd ¶ms, cudaStream_t stream); +template void causal_conv1d_bwd_cuda(ConvParamsBwd ¶ms, cudaStream_t stream); +template void causal_conv1d_bwd_cuda(ConvParamsBwd ¶ms, cudaStream_t stream); + +template void causal_conv1d_channellast_bwd_cuda(ConvParamsBwd ¶ms, cudaStream_t stream); +template void causal_conv1d_channellast_bwd_cuda(ConvParamsBwd ¶ms, cudaStream_t stream); +template void causal_conv1d_channellast_bwd_cuda(ConvParamsBwd ¶ms, cudaStream_t stream); +template void causal_conv1d_channellast_bwd_cuda(ConvParamsBwd ¶ms, cudaStream_t stream); +template void causal_conv1d_channellast_bwd_cuda(ConvParamsBwd ¶ms, cudaStream_t stream); +template void causal_conv1d_channellast_bwd_cuda(ConvParamsBwd ¶ms, cudaStream_t stream); +template void causal_conv1d_channellast_bwd_cuda(ConvParamsBwd ¶ms, cudaStream_t stream); +template void causal_conv1d_channellast_bwd_cuda(ConvParamsBwd ¶ms, cudaStream_t stream); +template void causal_conv1d_channellast_bwd_cuda(ConvParamsBwd ¶ms, cudaStream_t stream); \ No newline at end of file diff --git 
a/causal-conv1d/csrc/causal_conv1d_common.h b/causal-conv1d/csrc/causal_conv1d_common.h new file mode 100644 index 0000000000000000000000000000000000000000..8dd6a333b52163986c085f71475709706ce8f9c3 --- /dev/null +++ b/causal-conv1d/csrc/causal_conv1d_common.h @@ -0,0 +1,64 @@ +/****************************************************************************** + * Copyright (c) 2023, Tri Dao. + ******************************************************************************/ + +#pragma once + +#include +#include + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template struct BytesToType {}; + +template<> struct BytesToType<16> { + using Type = uint4; + static_assert(sizeof(Type) == 16); +}; + +template<> struct BytesToType<8> { + using Type = uint64_t; + static_assert(sizeof(Type) == 8); +}; + +template<> struct BytesToType<4> { + using Type = uint32_t; + static_assert(sizeof(Type) == 4); +}; + +template<> struct BytesToType<2> { + using Type = uint16_t; + static_assert(sizeof(Type) == 2); +}; + +template<> struct BytesToType<1> { + using Type = uint8_t; + static_assert(sizeof(Type) == 1); +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct SumOp { +__device__ inline T operator()(T const & x, T const & y) { return x + y; } +}; + +template +struct Allreduce { + static_assert(THREADS == 32 || THREADS == 16 || THREADS == 8 || THREADS == 4); + template + static __device__ inline T run(T x, Operator &op) { + constexpr int OFFSET = THREADS / 2; + x = op(x, __shfl_xor_sync(uint32_t(-1), x, OFFSET)); + return Allreduce::run(x, op); + } +}; + +template<> +struct Allreduce<2> { +template +static __device__ inline T run(T x, Operator &op) { + x = op(x, __shfl_xor_sync(uint32_t(-1), x, 1)); + return x; +} +}; diff --git a/causal-conv1d/csrc/causal_conv1d_fwd.cu b/causal-conv1d/csrc/causal_conv1d_fwd.cu new file mode 100644 index 0000000000000000000000000000000000000000..74a1459f88a87ef427075a25e5081899e382efc0 --- /dev/null +++ b/causal-conv1d/csrc/causal_conv1d_fwd.cu @@ -0,0 +1,350 @@ +/****************************************************************************** + * Copyright (c) 2023, Tri Dao. + ******************************************************************************/ + +#include +#include +#include // For C10_CUDA_CHECK and C10_CUDA_KERNEL_LAUNCH_CHECK + +#include +#include + +#include "causal_conv1d.h" +#include "causal_conv1d_common.h" +#include "static_switch.h" + +template +struct Causal_conv1d_fwd_kernel_traits { + using input_t = input_t_; + using weight_t = weight_t_; + static constexpr int kNThreads = kNThreads_; + static constexpr int kWidth = kWidth_; + static constexpr int kNBytes = sizeof(input_t); + static_assert(kNBytes == 2 || kNBytes == 4); + static constexpr int kNElts = kNBytes == 4 ? 4 : 8; + static_assert(kWidth <= kNElts); + static constexpr bool kIsVecLoad = kIsVecLoad_; + using vec_t = typename BytesToType::Type; + using BlockLoadT = cub::BlockLoad; + using BlockLoadVecT = cub::BlockLoad; + using BlockStoreT = cub::BlockStore; + using BlockStoreVecT = cub::BlockStore; + static constexpr int kSmemIOSize = kIsVecLoad + ? 
0 + : std::max({sizeof(typename BlockLoadT::TempStorage), sizeof(typename BlockStoreT::TempStorage)}); + static constexpr int kSmemExchangeSize = kNThreads * kNBytes * kNElts; + static constexpr int kSmemSize = kSmemIOSize + kSmemExchangeSize; +}; + +template +__global__ __launch_bounds__(Ktraits::kNThreads) +void causal_conv1d_fwd_kernel(ConvParamsBase params) { + constexpr int kWidth = Ktraits::kWidth; + constexpr int kNThreads = Ktraits::kNThreads; + constexpr int kNElts = Ktraits::kNElts; + constexpr bool kIsVecLoad = Ktraits::kIsVecLoad; + using input_t = typename Ktraits::input_t; + using vec_t = typename Ktraits::vec_t; + using weight_t = typename Ktraits::weight_t; + + // Shared memory. + extern __shared__ char smem_[]; + auto& smem_load = reinterpret_cast(smem_); + auto& smem_load_vec = reinterpret_cast(smem_); + auto& smem_store = reinterpret_cast(smem_); + auto& smem_store_vec = reinterpret_cast(smem_); + vec_t *smem_exchange = reinterpret_cast(smem_ + Ktraits::kSmemIOSize); + + const int tidx = threadIdx.x; + const int batch_id = blockIdx.x; + const int channel_id = blockIdx.y; + input_t *x = reinterpret_cast(params.x_ptr) + batch_id * params.x_batch_stride + + channel_id * params.x_c_stride; + weight_t *weight = reinterpret_cast(params.weight_ptr) + channel_id * params.weight_c_stride; + input_t *out = reinterpret_cast(params.out_ptr) + batch_id * params.out_batch_stride + + channel_id * params.out_c_stride; + float bias_val = params.bias_ptr == nullptr ? 0.f : float(reinterpret_cast(params.bias_ptr)[channel_id]); + + // Thread 0 will load the last elements of the previous chunk, so we initialize those to 0. + if (tidx == 0) { + input_t zeros[kNElts] = {0}; + smem_exchange[kNThreads - 1] = reinterpret_cast(zeros)[0]; + } + + float weight_vals[kWidth]; + #pragma unroll + for (int i = 0; i < kWidth; ++i) { weight_vals[i] = float(weight[i * params.weight_width_stride]); } + + constexpr int kChunkSize = kNThreads * kNElts; + const int n_chunks = (params.seqlen + kChunkSize - 1) / kChunkSize; + for (int chunk = 0; chunk < n_chunks; ++chunk) { + input_t x_vals_load[2 * kNElts] = {0}; + if constexpr(kIsVecLoad) { + Ktraits::BlockLoadVecT(smem_load_vec).Load(reinterpret_cast(x), *reinterpret_cast(&x_vals_load[kNElts]), (params.seqlen - chunk * kChunkSize) / kNElts); + } else { + __syncthreads(); + Ktraits::BlockLoadT(smem_load).Load(x, *reinterpret_cast(&x_vals_load[kNElts]), params.seqlen - chunk * kChunkSize); + } + x += kChunkSize; + __syncthreads(); + // Thread kNThreads - 1 don't write yet, so that thread 0 can read + // the last elements of the previous chunk. + if (tidx < kNThreads - 1) { smem_exchange[tidx] = reinterpret_cast(x_vals_load)[1]; } + __syncthreads(); + reinterpret_cast(x_vals_load)[0] = smem_exchange[tidx > 0 ? tidx - 1 : kNThreads - 1]; + __syncthreads(); + // Now thread kNThreads - 1 can write the last elements of the current chunk. 
+ if (tidx == kNThreads - 1) { smem_exchange[tidx] = reinterpret_cast(x_vals_load)[1]; } + + float x_vals[2 * kNElts]; + #pragma unroll + for (int i = 0; i < 2 * kNElts; ++i) { x_vals[i] = float(x_vals_load[i]); } + + float out_vals[kNElts]; + #pragma unroll + for (int i = 0; i < kNElts; ++i) { + out_vals[i] = bias_val; + #pragma unroll + for (int w = 0; w < kWidth; ++w) { + out_vals[i] += weight_vals[w] * x_vals[kNElts + i - (kWidth - w - 1)]; + } + } + + if (params.silu_activation) { + #pragma unroll + for (int i = 0; i < kNElts; ++i) { + out_vals[i] = out_vals[i] / (1 + expf(-out_vals[i])); + } + } + + input_t out_vals_store[kNElts]; + #pragma unroll + for (int i = 0; i < kNElts; ++i) { out_vals_store[i] = out_vals[i]; } + if constexpr(kIsVecLoad) { + Ktraits::BlockStoreVecT(smem_store_vec).Store(reinterpret_cast(out), reinterpret_cast(out_vals_store), (params.seqlen - chunk * kChunkSize) / kNElts); + } else { + Ktraits::BlockStoreT(smem_store).Store(out, out_vals_store, params.seqlen - chunk * kChunkSize); + } + out += kChunkSize; + } +} + +template +void causal_conv1d_fwd_launch(ConvParamsBase ¶ms, cudaStream_t stream) { + static constexpr int kNElts = sizeof(input_t) == 4 ? 4 : 8; + BOOL_SWITCH(params.seqlen % kNElts == 0, kIsVecLoad, [&] { + using Ktraits = Causal_conv1d_fwd_kernel_traits; + constexpr int kSmemSize = Ktraits::kSmemSize; + dim3 grid(params.batch, params.dim); + auto kernel = &causal_conv1d_fwd_kernel; + if (kSmemSize >= 48 * 1024) { + C10_CUDA_CHECK(cudaFuncSetAttribute( + kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize)); + } + kernel<<>>(params); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); +} + +template +void causal_conv1d_fwd_cuda(ConvParamsBase ¶ms, cudaStream_t stream) { + if (params.width == 2) { + causal_conv1d_fwd_launch<128, 2, input_t, weight_t>(params, stream); + } else if (params.width == 3) { + causal_conv1d_fwd_launch<128, 3, input_t, weight_t>(params, stream); + } else if (params.width == 4) { + causal_conv1d_fwd_launch<128, 4, input_t, weight_t>(params, stream); + } +} + +template +struct Causal_conv1d_channellast_fwd_kernel_traits { + // The cache line is 128 bytes, and we try to read 16 bytes per thread. + // So we have 8 threads per "row", so 32 or 64 elements in the channel dimension. + // That leaves 4 columns per warp, and so 16 columns per block (assuming each block has 128 + // threads). Each each load is 16 x 32|64 elements in the L x C dimensions. + using input_t = input_t_; + using weight_t = weight_t_; + static constexpr int kNThreads = kNThreads_; + static_assert(kNThreads % 32 == 0); + static constexpr int kNWarps = kNThreads / 32; + static constexpr int kWidth = kWidth_; + static constexpr int kChunkSizeL = kChunkSizeL_; + static constexpr int kNBytes = sizeof(input_t); + static_assert(kNBytes == 2 || kNBytes == 4); + static constexpr int kNElts = kNBytes == 4 ? 
4 : 8; + static constexpr int kNEltsPerRow = 128 / kNBytes; + static constexpr int kNThreadsPerRow = kNEltsPerRow / kNElts; // Always 8 for now + static_assert(kNThreadsPerRow * kNBytes * kNElts == 128); + static constexpr int kNColsPerWarp = 32 / kNThreadsPerRow; // Always 4 for now + static_assert(kNColsPerWarp * kNThreadsPerRow == 32); + static constexpr int kNColsPerLoad = kNColsPerWarp * kNWarps; + static constexpr int kNLoads = kChunkSizeL / kNColsPerLoad; + static_assert(kNLoads * kNColsPerLoad == kChunkSizeL); + static constexpr bool kIsVecLoad = kIsVecLoad_; + using vec_t = typename BytesToType::Type; + // using BlockLoadT = cub::BlockLoad; + // using BlockStoreT = cub::BlockStore; + // static constexpr int kSmemSize = std::max({sizeof(typename BlockLoadT::TempStorage), + // sizeof(typename BlockStoreT::TempStorage)}); + // static constexpr int kSmemSize = kChunkSizeL * kNEltsPerRow * kNBytes; +}; + +template +__global__ __launch_bounds__(Ktraits::kNThreads) +void causal_conv1d_channellast_fwd_kernel(ConvParamsBase params) { + constexpr int kWidth = Ktraits::kWidth; + constexpr int kNThreads = Ktraits::kNThreads; + constexpr int kNElts = Ktraits::kNElts; + constexpr int kNWarp = Ktraits::kNWarps; + constexpr int kNThreadsPerC = Ktraits::kNThreadsPerRow; + constexpr int kLPerLoad = Ktraits::kNColsPerLoad; + constexpr int kChunkSizeL = Ktraits::kChunkSizeL; + constexpr int kChunkSizeC = Ktraits::kNEltsPerRow; + using input_t = typename Ktraits::input_t; + using vec_t = typename Ktraits::vec_t; + using weight_t = typename Ktraits::weight_t; + + // Shared memory. + __shared__ input_t x_smem[kWidth - 1 + kChunkSizeL][kChunkSizeC + kNElts]; + + const int tid = threadIdx.x; + const int l_idx = tid / kNThreadsPerC; + const int c_idx = tid % kNThreadsPerC; + const int batch_id = blockIdx.x; + const int chunk_l_id = blockIdx.y; + const int chunk_c_id = blockIdx.z; + input_t *x = reinterpret_cast(params.x_ptr) + batch_id * params.x_batch_stride + + (chunk_l_id * kChunkSizeL + l_idx) * params.x_l_stride + chunk_c_id * kChunkSizeC + c_idx * kNElts; + weight_t *weight = reinterpret_cast(params.weight_ptr) + + chunk_c_id * kChunkSizeC * params.weight_c_stride; + input_t *out = reinterpret_cast(params.out_ptr) + batch_id * params.out_batch_stride + + (chunk_l_id * kChunkSizeL + l_idx) * params.out_l_stride + chunk_c_id * kChunkSizeC + c_idx * kNElts; + + #pragma unroll + for (int l = 0; l < Ktraits::kNLoads; ++l) { + input_t x_vals_load[kNElts] = {0}; + if (chunk_l_id * kChunkSizeL + l * kLPerLoad + l_idx < params.seqlen + && chunk_c_id * kChunkSizeC + c_idx * kNElts < params.dim) { + reinterpret_cast(x_vals_load)[0] = *reinterpret_cast(x + l * kLPerLoad * params.x_l_stride); + } + reinterpret_cast(x_smem[kWidth - 1 + l * kLPerLoad + l_idx])[c_idx] = reinterpret_cast(x_vals_load)[0]; + } + // Load the elements from the previous chunk that are needed for convolution. 
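+    // These kWidth - 1 rows are the causal "halo": the samples just before this L-chunk. Indices
+    // that fall before the start of the sequence (or outside the channel range) keep their zero
+    // initialization, which matches the zero left-padding a causal convolution expects.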
+    if (l_idx < kWidth - 1) {
+        input_t x_vals_load[kNElts] = {0};
+        if (chunk_l_id * kChunkSizeL + l_idx - (kWidth - 1) >= 0
+            && chunk_l_id * kChunkSizeL + l_idx - (kWidth - 1) < params.seqlen
+            && chunk_c_id * kChunkSizeC + c_idx * kNElts < params.dim) {
+            reinterpret_cast<vec_t *>(x_vals_load)[0] = *reinterpret_cast<vec_t *>(x - (kWidth - 1) * params.x_l_stride);
+        }
+        reinterpret_cast<vec_t *>(x_smem[l_idx])[c_idx] = reinterpret_cast<vec_t *>(x_vals_load)[0];
+    }
+
+    __syncthreads();
+
+    constexpr int kLPerThread = std::min(kChunkSizeL * kChunkSizeC / kNThreads, kChunkSizeL);
+    static_assert(kLPerThread * kNThreads == kChunkSizeL * kChunkSizeC);
+    constexpr int kNThreadsPerRow = kChunkSizeL / kLPerThread;
+    static_assert(kNThreadsPerRow * kLPerThread == kChunkSizeL);
+    // kChunkSizeL, kLPerThread, kNThreadsPerRow should be powers of 2 for simplicity
+    static_assert((kChunkSizeL & (kChunkSizeL - 1)) == 0);
+    static_assert((kLPerThread & (kLPerThread - 1)) == 0);
+    static_assert((kNThreadsPerRow & (kNThreadsPerRow - 1)) == 0);
+    static_assert(kNThreadsPerRow <= 32);
+
+    const int row_idx = tid / kNThreadsPerRow;
+    const int col_idx = tid % kNThreadsPerRow;
+
+    float bias_val = params.bias_ptr == nullptr || chunk_c_id * kChunkSizeC + row_idx >= params.dim ? 0.f : float(reinterpret_cast<weight_t *>(params.bias_ptr)[chunk_c_id * kChunkSizeC + row_idx]);
+    float weight_vals[kWidth] = {0};
+    if (chunk_c_id * kChunkSizeC + row_idx < params.dim) {
+        #pragma unroll
+        for (int w = 0; w < kWidth; ++w) {
+            weight_vals[w] = weight[row_idx * params.weight_c_stride + w * params.weight_width_stride];
+        }
+    }
+    float x_vals[kWidth - 1 + kLPerThread];
+    #pragma unroll
+    for (int i = 0; i < kWidth - 1 + kLPerThread; ++i) {
+        x_vals[i] = float(x_smem[col_idx * kLPerThread + i][row_idx]);
+    }
+
+    float out_vals[kLPerThread];
+    #pragma unroll
+    for (int i = 0; i < kLPerThread; ++i) {
+        out_vals[i] = bias_val;
+        #pragma unroll
+        for (int w = 0; w < kWidth; ++w) { out_vals[i] += weight_vals[w] * x_vals[i + w]; }
+        if (params.silu_activation) { out_vals[i] = out_vals[i] / (1 + expf(-out_vals[i])); }
+    }
+
+    // Since kNThreadsPerRow is a power of 2 and <= 32, we only need syncwarp and not syncthreads.
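+    // Threads whose x_smem reads overlap share the same row_idx and have adjacent col_idx, i.e.
+    // consecutive thread ids inside one group of kNThreadsPerRow threads. Because kNThreadsPerRow
+    // is a power of 2 that divides 32, such a group never straddles a warp boundary, so a
+    // warp-level barrier suffices before the results overwrite x_smem below.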
+ __syncwarp(); + #pragma unroll + for (int i = 0; i < kLPerThread; ++i) { x_smem[col_idx * kLPerThread + i][row_idx] = out_vals[i]; } + __syncthreads(); + + #pragma unroll + for (int l = 0; l < Ktraits::kNLoads; ++l) { + input_t out_vals_store[kNElts]; + reinterpret_cast(out_vals_store)[0] = reinterpret_cast(x_smem[l * kLPerLoad + l_idx])[c_idx]; + if (chunk_l_id * kChunkSizeL + l * kLPerLoad + l_idx < params.seqlen + && chunk_c_id * kChunkSizeC + c_idx * kNElts < params.dim) { + *reinterpret_cast(out + l * kLPerLoad * params.out_l_stride) = reinterpret_cast(out_vals_store)[0]; + } + } + +} + +template +void causal_conv1d_channellast_fwd_launch(ConvParamsBase ¶ms, cudaStream_t stream) { + using Ktraits = Causal_conv1d_channellast_fwd_kernel_traits; + // constexpr int kSmemSize = Ktraits::kSmemSize; + constexpr int kChunkSizeL = Ktraits::kChunkSizeL; + constexpr int kChunkSizeC = Ktraits::kNEltsPerRow; + const int n_chunks_L = (params.seqlen + kChunkSizeL - 1) / kChunkSizeL; + const int n_chunks_C = (params.dim + kChunkSizeC - 1) / kChunkSizeC; + // printf("n_chunks_L: %d, n_chunks_C: %d\n", n_chunks_L, n_chunks_C); + dim3 grid(params.batch, n_chunks_L, n_chunks_C); + dim3 block(Ktraits::kNThreads); + auto kernel = &causal_conv1d_channellast_fwd_kernel; + // if (kSmemSize >= 48 * 1024) { + // C10_CUDA_CHECK(cudaFuncSetAttribute( + // kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize)); + // } + // kernel<<>>(params); + kernel<<>>(params); + C10_CUDA_KERNEL_LAUNCH_CHECK(); +} + +template +void causal_conv1d_channellast_fwd_cuda(ConvParamsBase ¶ms, cudaStream_t stream) { + if (params.width == 2) { + causal_conv1d_channellast_fwd_launch<128, 2, input_t, weight_t>(params, stream); + } else if (params.width == 3) { + causal_conv1d_channellast_fwd_launch<128, 3, input_t, weight_t>(params, stream); + } else if (params.width == 4) { + causal_conv1d_channellast_fwd_launch<128, 4, input_t, weight_t>(params, stream); + } +} + +template void causal_conv1d_fwd_cuda(ConvParamsBase ¶ms, cudaStream_t stream); +template void causal_conv1d_fwd_cuda(ConvParamsBase ¶ms, cudaStream_t stream); +template void causal_conv1d_fwd_cuda(ConvParamsBase ¶ms, cudaStream_t stream); +template void causal_conv1d_fwd_cuda(ConvParamsBase ¶ms, cudaStream_t stream); +template void causal_conv1d_fwd_cuda(ConvParamsBase ¶ms, cudaStream_t stream); +template void causal_conv1d_fwd_cuda(ConvParamsBase ¶ms, cudaStream_t stream); +template void causal_conv1d_fwd_cuda(ConvParamsBase ¶ms, cudaStream_t stream); +template void causal_conv1d_fwd_cuda(ConvParamsBase ¶ms, cudaStream_t stream); +template void causal_conv1d_fwd_cuda(ConvParamsBase ¶ms, cudaStream_t stream); + +template void causal_conv1d_channellast_fwd_cuda(ConvParamsBase ¶ms, cudaStream_t stream); +template void causal_conv1d_channellast_fwd_cuda(ConvParamsBase ¶ms, cudaStream_t stream); +template void causal_conv1d_channellast_fwd_cuda(ConvParamsBase ¶ms, cudaStream_t stream); +template void causal_conv1d_channellast_fwd_cuda(ConvParamsBase ¶ms, cudaStream_t stream); +template void causal_conv1d_channellast_fwd_cuda(ConvParamsBase ¶ms, cudaStream_t stream); +template void causal_conv1d_channellast_fwd_cuda(ConvParamsBase ¶ms, cudaStream_t stream); +template void causal_conv1d_channellast_fwd_cuda(ConvParamsBase ¶ms, cudaStream_t stream); +template void causal_conv1d_channellast_fwd_cuda(ConvParamsBase ¶ms, cudaStream_t stream); +template void causal_conv1d_channellast_fwd_cuda(ConvParamsBase ¶ms, cudaStream_t stream); \ No newline at end of file diff --git 
a/causal-conv1d/csrc/causal_conv1d_update.cu b/causal-conv1d/csrc/causal_conv1d_update.cu new file mode 100644 index 0000000000000000000000000000000000000000..713e0ac883853491f9bdb0015b578657c228c1e7 --- /dev/null +++ b/causal-conv1d/csrc/causal_conv1d_update.cu @@ -0,0 +1,96 @@ +/****************************************************************************** + * Copyright (c) 2023, Tri Dao. + ******************************************************************************/ + +#include +#include +#include // For C10_CUDA_CHECK and C10_CUDA_KERNEL_LAUNCH_CHECK + +#include +#include + +#include "causal_conv1d.h" +#include "causal_conv1d_common.h" +#include "static_switch.h" + +template +struct Causal_conv1d_update_kernel_traits { + using input_t = input_t_; + using weight_t = weight_t_; + static constexpr int kNThreads = kNThreads_; + static constexpr int kWidth = kWidth_; + static constexpr int kNBytes = sizeof(input_t); + static_assert(kNBytes == 2 || kNBytes == 4); +}; + +template +__global__ __launch_bounds__(Ktraits::kNThreads) +void causal_conv1d_update_kernel(ConvParamsBase params) { + constexpr int kWidth = Ktraits::kWidth; + constexpr int kNThreads = Ktraits::kNThreads; + using input_t = typename Ktraits::input_t; + using weight_t = typename Ktraits::weight_t; + + const int tidx = threadIdx.x; + const int batch_id = blockIdx.x; + const int channel_id = blockIdx.y * kNThreads + tidx; + input_t *x = reinterpret_cast(params.x_ptr) + batch_id * params.x_batch_stride + + channel_id * params.x_c_stride; + input_t *conv_state = reinterpret_cast(params.conv_state_ptr) + batch_id * params.conv_state_batch_stride + + channel_id * params.conv_state_c_stride; + weight_t *weight = reinterpret_cast(params.weight_ptr) + channel_id * params.weight_c_stride; + input_t *out = reinterpret_cast(params.out_ptr) + batch_id * params.out_batch_stride + + channel_id * params.out_c_stride; + float bias_val = params.bias_ptr == nullptr || channel_id >= params.dim ? 
0.f : float(reinterpret_cast(params.bias_ptr)[channel_id]); + + float weight_vals[kWidth] = {0}; + if (channel_id < params.dim) { + #pragma unroll + for (int i = 0; i < kWidth; ++i) { weight_vals[i] = float(weight[i * params.weight_width_stride]); } + } + + float x_vals[kWidth] = {0}; + if (channel_id < params.dim) { + #pragma unroll + for (int i = 0; i < kWidth - 1; ++i) { x_vals[i] = float(conv_state[(i + 1) * params.conv_state_l_stride]); } + x_vals[kWidth - 1] = float(x[0]); + #pragma unroll + for (int i = 0; i < kWidth; ++i) { conv_state[i * params.conv_state_l_stride] = input_t(x_vals[i]); } + } + + float out_val = bias_val; + #pragma unroll + for (int i = 0; i < kWidth; ++i) { out_val += weight_vals[i] * x_vals[i]; } + if (params.silu_activation) { out_val = out_val / (1 + expf(-out_val)); } + if (channel_id < params.dim) { out[0] = input_t(out_val); } +} + +template +void causal_conv1d_update_launch(ConvParamsBase ¶ms, cudaStream_t stream) { + using Ktraits = Causal_conv1d_update_kernel_traits; + dim3 grid(params.batch, (params.dim + kNThreads - 1) / kNThreads); + auto kernel = &causal_conv1d_update_kernel; + kernel<<>>(params); + C10_CUDA_KERNEL_LAUNCH_CHECK(); +} + +template +void causal_conv1d_update_cuda(ConvParamsBase ¶ms, cudaStream_t stream) { + if (params.width == 2) { + causal_conv1d_update_launch<64, 2, input_t, weight_t>(params, stream); + } else if (params.width == 3) { + causal_conv1d_update_launch<64, 3, input_t, weight_t>(params, stream); + } else if (params.width == 4) { + causal_conv1d_update_launch<64, 4, input_t, weight_t>(params, stream); + } +} + +template void causal_conv1d_update_cuda(ConvParamsBase ¶ms, cudaStream_t stream); +template void causal_conv1d_update_cuda(ConvParamsBase ¶ms, cudaStream_t stream); +template void causal_conv1d_update_cuda(ConvParamsBase ¶ms, cudaStream_t stream); +template void causal_conv1d_update_cuda(ConvParamsBase ¶ms, cudaStream_t stream); +template void causal_conv1d_update_cuda(ConvParamsBase ¶ms, cudaStream_t stream); +template void causal_conv1d_update_cuda(ConvParamsBase ¶ms, cudaStream_t stream); +template void causal_conv1d_update_cuda(ConvParamsBase ¶ms, cudaStream_t stream); +template void causal_conv1d_update_cuda(ConvParamsBase ¶ms, cudaStream_t stream); +template void causal_conv1d_update_cuda(ConvParamsBase ¶ms, cudaStream_t stream); \ No newline at end of file diff --git a/causal-conv1d/csrc/static_switch.h b/causal-conv1d/csrc/static_switch.h new file mode 100644 index 0000000000000000000000000000000000000000..0f4ad3eb62235443d15c454b6691c2ec63645219 --- /dev/null +++ b/causal-conv1d/csrc/static_switch.h @@ -0,0 +1,25 @@ +// Inspired by https://github.com/NVIDIA/DALI/blob/main/include/dali/core/static_switch.h +// and https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Dispatch.h + +#pragma once + +/// @param COND - a boolean expression to switch by +/// @param CONST_NAME - a name given for the constexpr bool variable. +/// @param ... - code to execute for true and false +/// +/// Usage: +/// ``` +/// BOOL_SWITCH(flag, BoolConst, [&] { +/// some_function(...); +/// }); +/// ``` +#define BOOL_SWITCH(COND, CONST_NAME, ...) 
\ + [&] { \ + if (COND) { \ + static constexpr bool CONST_NAME = true; \ + return __VA_ARGS__(); \ + } else { \ + static constexpr bool CONST_NAME = false; \ + return __VA_ARGS__(); \ + } \ + }() diff --git a/causal-conv1d/setup.py b/causal-conv1d/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..12e36bf988215a4c536278026e6f4401e66534da --- /dev/null +++ b/causal-conv1d/setup.py @@ -0,0 +1,264 @@ +# Copyright (c) 2023, Tri Dao. +import sys +import warnings +import os +import re +import ast +from pathlib import Path +from packaging.version import parse, Version +import platform + +from setuptools import setup, find_packages +import subprocess + +import urllib.request +import urllib.error +from wheel.bdist_wheel import bdist_wheel as _bdist_wheel + +import torch +from torch.utils.cpp_extension import ( + BuildExtension, + CppExtension, + CUDAExtension, + CUDA_HOME, +) + + +with open("README.md", "r", encoding="utf-8") as fh: + long_description = fh.read() + + +# ninja build does not work unless include_dirs are abs path +this_dir = os.path.dirname(os.path.abspath(__file__)) + +PACKAGE_NAME = "causal_conv1d" + +BASE_WHEEL_URL = "https://github.com/Dao-AILab/causal-conv1d/releases/download/{tag_name}/{wheel_name}" + +# FORCE_BUILD: Force a fresh build locally, instead of attempting to find prebuilt wheels +# SKIP_CUDA_BUILD: Intended to allow CI to use a simple `python setup.py sdist` run to copy over raw files, without any cuda compilation +FORCE_BUILD = os.getenv("CAUSAL_CONV1D_FORCE_BUILD", "FALSE") == "TRUE" +SKIP_CUDA_BUILD = os.getenv("CAUSAL_CONV1D_SKIP_CUDA_BUILD", "FALSE") == "TRUE" +# For CI, we want the option to build with C++11 ABI since the nvcr images use C++11 ABI +FORCE_CXX11_ABI = os.getenv("CAUSAL_CONV1D_FORCE_CXX11_ABI", "FALSE") == "TRUE" + + +def get_platform(): + """ + Returns the platform name as used in wheel filenames. + """ + if sys.platform.startswith("linux"): + return "linux_x86_64" + elif sys.platform == "darwin": + mac_version = ".".join(platform.mac_ver()[0].split(".")[:2]) + return f"macosx_{mac_version}_x86_64" + elif sys.platform == "win32": + return "win_amd64" + else: + raise ValueError("Unsupported platform: {}".format(sys.platform)) + + +def get_cuda_bare_metal_version(cuda_dir): + raw_output = subprocess.check_output( + [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True + ) + output = raw_output.split() + release_idx = output.index("release") + 1 + bare_metal_version = parse(output[release_idx].split(",")[0]) + + return raw_output, bare_metal_version + + +def check_if_cuda_home_none(global_option: str) -> None: + if CUDA_HOME is not None: + return + # warn instead of error because user could be downloading prebuilt wheels, so nvcc won't be necessary + # in that case. + warnings.warn( + f"{global_option} was requested, but nvcc was not found. Are you sure your environment has nvcc available? " + "If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, " + "only images whose names contain 'devel' will provide nvcc." 
+ ) + + +def append_nvcc_threads(nvcc_extra_args): + return nvcc_extra_args + ["--threads", "4"] + + +cmdclass = {} +ext_modules = [] + +if not SKIP_CUDA_BUILD: + print("\n\ntorch.__version__ = {}\n\n".format(torch.__version__)) + TORCH_MAJOR = int(torch.__version__.split(".")[0]) + TORCH_MINOR = int(torch.__version__.split(".")[1]) + + check_if_cuda_home_none("causal_conv1d") + # Check, if CUDA11 is installed for compute capability 8.0 + cc_flag = [] + if CUDA_HOME is not None: + _, bare_metal_version = get_cuda_bare_metal_version(CUDA_HOME) + if bare_metal_version < Version("11.6"): + raise RuntimeError( + "causal_conv1d is only supported on CUDA 11.6 and above. " + "Note: make sure nvcc has a supported version by running nvcc -V." + ) + + cc_flag.append("-gencode") + cc_flag.append("arch=compute_70,code=sm_70") + cc_flag.append("-gencode") + cc_flag.append("arch=compute_80,code=sm_80") + if bare_metal_version >= Version("11.8"): + cc_flag.append("-gencode") + cc_flag.append("arch=compute_90,code=sm_90") + + # HACK: The compiler flag -D_GLIBCXX_USE_CXX11_ABI is set to be the same as + # torch._C._GLIBCXX_USE_CXX11_ABI + # https://github.com/pytorch/pytorch/blob/8472c24e3b5b60150096486616d98b7bea01500b/torch/utils/cpp_extension.py#L920 + if FORCE_CXX11_ABI: + torch._C._GLIBCXX_USE_CXX11_ABI = True + + ext_modules.append( + CUDAExtension( + name="causal_conv1d_cuda", + sources=[ + "csrc/causal_conv1d.cpp", + "csrc/causal_conv1d_fwd.cu", + "csrc/causal_conv1d_bwd.cu", + "csrc/causal_conv1d_update.cu", + ], + extra_compile_args={ + "cxx": ["-O3"], + "nvcc": append_nvcc_threads( + [ + "-O3", + "-U__CUDA_NO_HALF_OPERATORS__", + "-U__CUDA_NO_HALF_CONVERSIONS__", + "-U__CUDA_NO_BFLOAT16_OPERATORS__", + "-U__CUDA_NO_BFLOAT16_CONVERSIONS__", + "-U__CUDA_NO_BFLOAT162_OPERATORS__", + "-U__CUDA_NO_BFLOAT162_CONVERSIONS__", + "--expt-relaxed-constexpr", + "--expt-extended-lambda", + "--use_fast_math", + "--ptxas-options=-v", + "-lineinfo", + ] + + cc_flag + ), + }, + include_dirs=[this_dir], + ) + ) + + +def get_package_version(): + with open(Path(this_dir) / "causal_conv1d" / "__init__.py", "r") as f: + version_match = re.search(r"^__version__\s*=\s*(.*)$", f.read(), re.MULTILINE) + public_version = ast.literal_eval(version_match.group(1)) + local_version = os.environ.get("CAUSAL_CONV1D_LOCAL_VERSION") + if local_version: + return f"{public_version}+{local_version}" + else: + return str(public_version) + + +def get_wheel_url(): + # Determine the version numbers that will be used to determine the correct wheel + # We're using the CUDA version used to build torch, not the one currently installed + # _, cuda_version_raw = get_cuda_bare_metal_version(CUDA_HOME) + torch_cuda_version = parse(torch.version.cuda) + torch_version_raw = parse(torch.__version__) + # For CUDA 11, we only compile for CUDA 11.8, and for CUDA 12 we only compile for CUDA 12.2 + # to save CI time. Minor versions should be compatible. 
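+    # Purely illustrative (hypothetical version numbers): for torch 2.1 built against CUDA 11.8 on
+    # CPython 3.10 / Linux x86-64, the filename assembled below comes out roughly as
+    # causal_conv1d-1.0.0+cu118torch2.1cxx11abiFALSE-cp310-cp310-linux_x86_64.whl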
+ torch_cuda_version = parse("11.8") if torch_cuda_version.major == 11 else parse("12.2") + python_version = f"cp{sys.version_info.major}{sys.version_info.minor}" + platform_name = get_platform() + causal_conv1d_version = get_package_version() + # cuda_version = f"{cuda_version_raw.major}{cuda_version_raw.minor}" + cuda_version = f"{torch_cuda_version.major}{torch_cuda_version.minor}" + torch_version = f"{torch_version_raw.major}.{torch_version_raw.minor}" + cxx11_abi = str(torch._C._GLIBCXX_USE_CXX11_ABI).upper() + + # Determine wheel URL based on CUDA version, torch version, python version and OS + wheel_filename = f"{PACKAGE_NAME}-{causal_conv1d_version}+cu{cuda_version}torch{torch_version}cxx11abi{cxx11_abi}-{python_version}-{python_version}-{platform_name}.whl" + wheel_url = BASE_WHEEL_URL.format( + tag_name=f"v{causal_conv1d_version}", wheel_name=wheel_filename + ) + return wheel_url, wheel_filename + + +class CachedWheelsCommand(_bdist_wheel): + """ + The CachedWheelsCommand plugs into the default bdist wheel, which is ran by pip when it cannot + find an existing wheel (which is currently the case for all installs). We use + the environment parameters to detect whether there is already a pre-built version of a compatible + wheel available and short-circuits the standard full build pipeline. + """ + + def run(self): + if FORCE_BUILD: + return super().run() + + wheel_url, wheel_filename = get_wheel_url() + print("Guessing wheel URL: ", wheel_url) + try: + urllib.request.urlretrieve(wheel_url, wheel_filename) + + # Make the archive + # Lifted from the root wheel processing command + # https://github.com/pypa/wheel/blob/cf71108ff9f6ffc36978069acb28824b44ae028e/src/wheel/bdist_wheel.py#LL381C9-L381C85 + if not os.path.exists(self.dist_dir): + os.makedirs(self.dist_dir) + + impl_tag, abi_tag, plat_tag = self.get_tag() + archive_basename = f"{self.wheel_dist_name}-{impl_tag}-{abi_tag}-{plat_tag}" + + wheel_path = os.path.join(self.dist_dir, archive_basename + ".whl") + print("Raw wheel path", wheel_path) + os.rename(wheel_filename, wheel_path) + except urllib.error.HTTPError: + print("Precompiled wheel not found. Building from source...") + # If the wheel could not be downloaded, build from source + super().run() + + +setup( + name=PACKAGE_NAME, + version=get_package_version(), + packages=find_packages( + exclude=( + "build", + "csrc", + "include", + "tests", + "dist", + "docs", + "benchmarks", + "causal_conv1d.egg-info", + ) + ), + author="Tri Dao", + author_email="tri@tridao.me", + description="Causal depthwise conv1d in CUDA, with a PyTorch interface", + long_description=long_description, + long_description_content_type="text/markdown", + url="https://github.com/Dao-AILab/causal-conv1d", + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: BSD License", + "Operating System :: Unix", + ], + ext_modules=ext_modules, + cmdclass={"bdist_wheel": CachedWheelsCommand, "build_ext": BuildExtension} + if ext_modules + else { + "bdist_wheel": CachedWheelsCommand, + }, + python_requires=">=3.7", + install_requires=[ + "torch", + "packaging", + "ninja", + ], +) diff --git a/causal-conv1d/tests/test_causal_conv1d.py b/causal-conv1d/tests/test_causal_conv1d.py new file mode 100644 index 0000000000000000000000000000000000000000..6e5985cfb0582e6656afb1d8b5c1de78f24f4276 --- /dev/null +++ b/causal-conv1d/tests/test_causal_conv1d.py @@ -0,0 +1,173 @@ +# Copyright (C) 2023, Tri Dao. 
+ +import math + +import torch +import pytest + +from einops import rearrange + +from causal_conv1d.causal_conv1d_interface import causal_conv1d_fn, causal_conv1d_ref +from causal_conv1d.causal_conv1d_interface import causal_conv1d_update, causal_conv1d_update_ref + + +@pytest.mark.parametrize("channel_last", [False, True]) +# @pytest.mark.parametrize('channel_last', [True]) +@pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16]) +# @pytest.mark.parametrize('itype', [torch.float16]) +@pytest.mark.parametrize("silu_activation", [False, True]) +# @pytest.mark.parametrize('silu_activation', [True]) +@pytest.mark.parametrize("has_bias", [False, True]) +# @pytest.mark.parametrize('has_bias', [True]) +@pytest.mark.parametrize("width", [2, 3, 4]) +# @pytest.mark.parametrize('width', [2]) +@pytest.mark.parametrize( + "seqlen", [8, 16, 32, 64, 128, 151, 256, 372, 512, 784, 1024, 1134, 2048, 4096] +) +# @pytest.mark.parametrize('seqlen', [8, 16, 32, 64, 128, 256, 512, 784, 1024, 2048, 4096]) +# @pytest.mark.parametrize('seqlen', [128]) +def test_causal_conv1d(seqlen, width, has_bias, silu_activation, itype, channel_last): + device = "cuda" + rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (3e-3, 5e-3) + if itype == torch.bfloat16: + rtol, atol = 1e-2, 5e-2 + rtolw, atolw = (1e-3, 1e-3) + # set seed + torch.random.manual_seed(0) + batch_size = 2 + # batch_size = 1 + dim = 4096 + 32 # Try dim not divisible by 64 + # dim = 64 + if not channel_last: + x = torch.randn(batch_size, 4096 + dim + 64, seqlen, device=device, dtype=itype)[:, 4096:4096 + dim, :].requires_grad_() + else: + x = rearrange( + torch.randn(batch_size, seqlen, 4096 + dim + 64, device=device, dtype=itype)[:, :, 4096:4096 + dim], "b s d -> b d s" + ).requires_grad_() + weight = torch.randn(dim, width, device=device, dtype=torch.float32, requires_grad=True) + if has_bias: + bias = torch.randn(dim, device=device, dtype=torch.float32, requires_grad=True) + else: + bias = None + x_ref = x.detach().clone().requires_grad_() + weight_ref = weight.detach().clone().requires_grad_() + bias_ref = bias.detach().clone().requires_grad_() if bias is not None else None + activation = None if not silu_activation else "silu" + out = causal_conv1d_fn(x, weight, bias, activation=activation) + out_ref = causal_conv1d_ref(x_ref, weight_ref, bias_ref, activation=activation) + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + assert torch.allclose(out, out_ref, rtol=rtol, atol=atol) + + g = torch.randn_like(out) + out_ref.backward(g) + out.backward(g) + + print(f"dx max diff: {(x.grad - x_ref.grad).abs().max().item()}") + print(f"dweight max diff: {(weight.grad - weight_ref.grad).abs().max().item()}") + if has_bias: + print(f"dbias max diff: {(bias.grad - bias_ref.grad).abs().max().item()}") + + assert torch.allclose(x.grad, x_ref.grad.to(dtype=itype), rtol=rtol, atol=atol) + assert torch.allclose(weight.grad, weight_ref.grad, rtol=rtolw, atol=atolw) + if has_bias: + assert torch.allclose(bias.grad, bias_ref.grad, rtol=rtolw, atol=atolw) + + +@pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16]) +# @pytest.mark.parametrize('itype', [torch.float16]) +@pytest.mark.parametrize("silu_activation", [False, True]) +# @pytest.mark.parametrize('silu_activation', [False]) +@pytest.mark.parametrize("has_bias", [False, True]) +# @pytest.mark.parametrize('has_bias', [True]) +@pytest.mark.parametrize("width", [2, 3, 4]) +# 
@pytest.mark.parametrize('width', [2]) +@pytest.mark.parametrize("dim", [2048, 2048 + 16, 4096]) +# @pytest.mark.parametrize("dim", [2048]) +def test_causal_conv1d_update(dim, width, has_bias, silu_activation, itype): + device = "cuda" + rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (3e-3, 5e-3) + if itype == torch.bfloat16: + rtol, atol = 1e-2, 5e-2 + rtolw, atolw = (1e-3, 1e-3) + # set seed + torch.random.manual_seed(0) + batch_size = 2 + # batch_size = 1 + # dim = 64 + x = torch.randn(batch_size, dim, device=device, dtype=itype) + conv_state = torch.randn(batch_size, dim, width, device=device, dtype=itype) + weight = torch.randn(dim, width, device=device, dtype=torch.float32, requires_grad=True) + if has_bias: + bias = torch.randn(dim, device=device, dtype=torch.float32, requires_grad=True) + else: + bias = None + conv_state_ref = conv_state.detach().clone() + activation = None if not silu_activation else "silu" + out = causal_conv1d_update(x, conv_state, weight, bias, activation=activation) + out_ref = causal_conv1d_update_ref(x, conv_state_ref, weight, bias, activation=activation) + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + assert torch.equal(conv_state, conv_state_ref) + assert torch.allclose(out, out_ref, rtol=rtol, atol=atol) + + +# @pytest.mark.parametrize("channel_last", [False, True]) +@pytest.mark.parametrize('channel_last', [True]) +# @pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16]) +@pytest.mark.parametrize('itype', [torch.bfloat16]) +# @pytest.mark.parametrize("silu_activation", [False, True]) +@pytest.mark.parametrize('silu_activation', [True]) +# @pytest.mark.parametrize("has_bias", [False, True]) +@pytest.mark.parametrize('has_bias', [True]) +# @pytest.mark.parametrize("width", [2, 3, 4]) +@pytest.mark.parametrize('width', [4]) +@pytest.mark.parametrize( + # "seqlen", [8, 16, 32, 64, 128, 151, 256, 372, 512, 784, 1024, 1134, 2048, 4096] + "seqlen", [2048] +) +# @pytest.mark.parametrize('seqlen', [8, 16, 32, 64, 128, 256, 512, 784, 1024, 2048, 4096]) +# @pytest.mark.parametrize('seqlen', [128]) +def test_causal_conv1d_race_condition(seqlen, width, has_bias, silu_activation, itype, channel_last): + device = "cuda" + # set seed + torch.random.manual_seed(0) + batch_size = 2 + # batch_size = 1 + dim = 4096 + 32 # Try dim not divisible by 64 + # dim = 64 + if not channel_last: + x = torch.randn(batch_size, 4096 + dim + 64, seqlen, device=device, dtype=itype)[:, 4096:4096 + dim, :].requires_grad_() + else: + x = rearrange( + torch.randn(batch_size, seqlen, 4096 + dim + 64, device=device, dtype=itype)[:, :, 4096:4096 + dim], "b s d -> b d s" + ).requires_grad_() + weight = torch.randn(dim, width, device=device, dtype=torch.float32, requires_grad=True) + if has_bias: + bias = torch.randn(dim, device=device, dtype=torch.float32, requires_grad=True) + else: + bias = None + activation = None if not silu_activation else "silu" + out0 = causal_conv1d_fn(x, weight, bias, activation=activation) + g = torch.randn_like(out0) + dx0, dw0, db0 = torch.autograd.grad(out0, (x, weight, bias), g) + dw_atol = 1e-4 + db_atol = 1e-4 + + for i in range(10000): + out = causal_conv1d_fn(x, weight, bias, activation=activation) + dx, dw, db = torch.autograd.grad(out, (x, weight, bias), g) + dw_equal = torch.allclose(dw, dw0, atol=dw_atol) + # if not dw_equal: + # breakpoint() + if has_bias: + db_equal = torch.allclose(db, db0, atol=db_atol) + # if not db_equal: + # 
breakpoint() + assert torch.equal(out, out0) + assert torch.equal(dx, dx0) + assert dw_equal + if has_bias: + assert dw_equal diff --git a/imagenet_class_index.py b/imagenet_class_index.py new file mode 100644 index 0000000000000000000000000000000000000000..5407d1471197fd9bfa466c47d8dfb6683cb9f551 --- /dev/null +++ b/imagenet_class_index.py @@ -0,0 +1,1002 @@ +imagenet_classnames = { + "0": ["n01440764", "tench"], + "1": ["n01443537", "goldfish"], + "2": ["n01484850", "great_white_shark"], + "3": ["n01491361", "tiger_shark"], + "4": ["n01494475", "hammerhead"], + "5": ["n01496331", "electric_ray"], + "6": ["n01498041", "stingray"], + "7": ["n01514668", "cock"], + "8": ["n01514859", "hen"], + "9": ["n01518878", "ostrich"], + "10": ["n01530575", "brambling"], + "11": ["n01531178", "goldfinch"], + "12": ["n01532829", "house_finch"], + "13": ["n01534433", "junco"], + "14": ["n01537544", "indigo_bunting"], + "15": ["n01558993", "robin"], + "16": ["n01560419", "bulbul"], + "17": ["n01580077", "jay"], + "18": ["n01582220", "magpie"], + "19": ["n01592084", "chickadee"], + "20": ["n01601694", "water_ouzel"], + "21": ["n01608432", "kite"], + "22": ["n01614925", "bald_eagle"], + "23": ["n01616318", "vulture"], + "24": ["n01622779", "great_grey_owl"], + "25": ["n01629819", "European_fire_salamander"], + "26": ["n01630670", "common_newt"], + "27": ["n01631663", "eft"], + "28": ["n01632458", "spotted_salamander"], + "29": ["n01632777", "axolotl"], + "30": ["n01641577", "bullfrog"], + "31": ["n01644373", "tree_frog"], + "32": ["n01644900", "tailed_frog"], + "33": ["n01664065", "loggerhead"], + "34": ["n01665541", "leatherback_turtle"], + "35": ["n01667114", "mud_turtle"], + "36": ["n01667778", "terrapin"], + "37": ["n01669191", "box_turtle"], + "38": ["n01675722", "banded_gecko"], + "39": ["n01677366", "common_iguana"], + "40": ["n01682714", "American_chameleon"], + "41": ["n01685808", "whiptail"], + "42": ["n01687978", "agama"], + "43": ["n01688243", "frilled_lizard"], + "44": ["n01689811", "alligator_lizard"], + "45": ["n01692333", "Gila_monster"], + "46": ["n01693334", "green_lizard"], + "47": ["n01694178", "African_chameleon"], + "48": ["n01695060", "Komodo_dragon"], + "49": ["n01697457", "African_crocodile"], + "50": ["n01698640", "American_alligator"], + "51": ["n01704323", "triceratops"], + "52": ["n01728572", "thunder_snake"], + "53": ["n01728920", "ringneck_snake"], + "54": ["n01729322", "hognose_snake"], + "55": ["n01729977", "green_snake"], + "56": ["n01734418", "king_snake"], + "57": ["n01735189", "garter_snake"], + "58": ["n01737021", "water_snake"], + "59": ["n01739381", "vine_snake"], + "60": ["n01740131", "night_snake"], + "61": ["n01742172", "boa_constrictor"], + "62": ["n01744401", "rock_python"], + "63": ["n01748264", "Indian_cobra"], + "64": ["n01749939", "green_mamba"], + "65": ["n01751748", "sea_snake"], + "66": ["n01753488", "horned_viper"], + "67": ["n01755581", "diamondback"], + "68": ["n01756291", "sidewinder"], + "69": ["n01768244", "trilobite"], + "70": ["n01770081", "harvestman"], + "71": ["n01770393", "scorpion"], + "72": ["n01773157", "black_and_gold_garden_spider"], + "73": ["n01773549", "barn_spider"], + "74": ["n01773797", "garden_spider"], + "75": ["n01774384", "black_widow"], + "76": ["n01774750", "tarantula"], + "77": ["n01775062", "wolf_spider"], + "78": ["n01776313", "tick"], + "79": ["n01784675", "centipede"], + "80": ["n01795545", "black_grouse"], + "81": ["n01796340", "ptarmigan"], + "82": ["n01797886", "ruffed_grouse"], + "83": ["n01798484", "prairie_chicken"], 
+ "84": ["n01806143", "peacock"], + "85": ["n01806567", "quail"], + "86": ["n01807496", "partridge"], + "87": ["n01817953", "African_grey"], + "88": ["n01818515", "macaw"], + "89": ["n01819313", "sulphur-crested_cockatoo"], + "90": ["n01820546", "lorikeet"], + "91": ["n01824575", "coucal"], + "92": ["n01828970", "bee_eater"], + "93": ["n01829413", "hornbill"], + "94": ["n01833805", "hummingbird"], + "95": ["n01843065", "jacamar"], + "96": ["n01843383", "toucan"], + "97": ["n01847000", "drake"], + "98": ["n01855032", "red-breasted_merganser"], + "99": ["n01855672", "goose"], + "100": ["n01860187", "black_swan"], + "101": ["n01871265", "tusker"], + "102": ["n01872401", "echidna"], + "103": ["n01873310", "platypus"], + "104": ["n01877812", "wallaby"], + "105": ["n01882714", "koala"], + "106": ["n01883070", "wombat"], + "107": ["n01910747", "jellyfish"], + "108": ["n01914609", "sea_anemone"], + "109": ["n01917289", "brain_coral"], + "110": ["n01924916", "flatworm"], + "111": ["n01930112", "nematode"], + "112": ["n01943899", "conch"], + "113": ["n01944390", "snail"], + "114": ["n01945685", "slug"], + "115": ["n01950731", "sea_slug"], + "116": ["n01955084", "chiton"], + "117": ["n01968897", "chambered_nautilus"], + "118": ["n01978287", "Dungeness_crab"], + "119": ["n01978455", "rock_crab"], + "120": ["n01980166", "fiddler_crab"], + "121": ["n01981276", "king_crab"], + "122": ["n01983481", "American_lobster"], + "123": ["n01984695", "spiny_lobster"], + "124": ["n01985128", "crayfish"], + "125": ["n01986214", "hermit_crab"], + "126": ["n01990800", "isopod"], + "127": ["n02002556", "white_stork"], + "128": ["n02002724", "black_stork"], + "129": ["n02006656", "spoonbill"], + "130": ["n02007558", "flamingo"], + "131": ["n02009229", "little_blue_heron"], + "132": ["n02009912", "American_egret"], + "133": ["n02011460", "bittern"], + "134": ["n02012849", "crane"], + "135": ["n02013706", "limpkin"], + "136": ["n02017213", "European_gallinule"], + "137": ["n02018207", "American_coot"], + "138": ["n02018795", "bustard"], + "139": ["n02025239", "ruddy_turnstone"], + "140": ["n02027492", "red-backed_sandpiper"], + "141": ["n02028035", "redshank"], + "142": ["n02033041", "dowitcher"], + "143": ["n02037110", "oystercatcher"], + "144": ["n02051845", "pelican"], + "145": ["n02056570", "king_penguin"], + "146": ["n02058221", "albatross"], + "147": ["n02066245", "grey_whale"], + "148": ["n02071294", "killer_whale"], + "149": ["n02074367", "dugong"], + "150": ["n02077923", "sea_lion"], + "151": ["n02085620", "Chihuahua"], + "152": ["n02085782", "Japanese_spaniel"], + "153": ["n02085936", "Maltese_dog"], + "154": ["n02086079", "Pekinese"], + "155": ["n02086240", "Shih-Tzu"], + "156": ["n02086646", "Blenheim_spaniel"], + "157": ["n02086910", "papillon"], + "158": ["n02087046", "toy_terrier"], + "159": ["n02087394", "Rhodesian_ridgeback"], + "160": ["n02088094", "Afghan_hound"], + "161": ["n02088238", "basset"], + "162": ["n02088364", "beagle"], + "163": ["n02088466", "bloodhound"], + "164": ["n02088632", "bluetick"], + "165": ["n02089078", "black-and-tan_coonhound"], + "166": ["n02089867", "Walker_hound"], + "167": ["n02089973", "English_foxhound"], + "168": ["n02090379", "redbone"], + "169": ["n02090622", "borzoi"], + "170": ["n02090721", "Irish_wolfhound"], + "171": ["n02091032", "Italian_greyhound"], + "172": ["n02091134", "whippet"], + "173": ["n02091244", "Ibizan_hound"], + "174": ["n02091467", "Norwegian_elkhound"], + "175": ["n02091635", "otterhound"], + "176": ["n02091831", "Saluki"], + "177": ["n02092002", 
"Scottish_deerhound"], + "178": ["n02092339", "Weimaraner"], + "179": ["n02093256", "Staffordshire_bullterrier"], + "180": ["n02093428", "American_Staffordshire_terrier"], + "181": ["n02093647", "Bedlington_terrier"], + "182": ["n02093754", "Border_terrier"], + "183": ["n02093859", "Kerry_blue_terrier"], + "184": ["n02093991", "Irish_terrier"], + "185": ["n02094114", "Norfolk_terrier"], + "186": ["n02094258", "Norwich_terrier"], + "187": ["n02094433", "Yorkshire_terrier"], + "188": ["n02095314", "wire-haired_fox_terrier"], + "189": ["n02095570", "Lakeland_terrier"], + "190": ["n02095889", "Sealyham_terrier"], + "191": ["n02096051", "Airedale"], + "192": ["n02096177", "cairn"], + "193": ["n02096294", "Australian_terrier"], + "194": ["n02096437", "Dandie_Dinmont"], + "195": ["n02096585", "Boston_bull"], + "196": ["n02097047", "miniature_schnauzer"], + "197": ["n02097130", "giant_schnauzer"], + "198": ["n02097209", "standard_schnauzer"], + "199": ["n02097298", "Scotch_terrier"], + "200": ["n02097474", "Tibetan_terrier"], + "201": ["n02097658", "silky_terrier"], + "202": ["n02098105", "soft-coated_wheaten_terrier"], + "203": ["n02098286", "West_Highland_white_terrier"], + "204": ["n02098413", "Lhasa"], + "205": ["n02099267", "flat-coated_retriever"], + "206": ["n02099429", "curly-coated_retriever"], + "207": ["n02099601", "golden_retriever"], + "208": ["n02099712", "Labrador_retriever"], + "209": ["n02099849", "Chesapeake_Bay_retriever"], + "210": ["n02100236", "German_short-haired_pointer"], + "211": ["n02100583", "vizsla"], + "212": ["n02100735", "English_setter"], + "213": ["n02100877", "Irish_setter"], + "214": ["n02101006", "Gordon_setter"], + "215": ["n02101388", "Brittany_spaniel"], + "216": ["n02101556", "clumber"], + "217": ["n02102040", "English_springer"], + "218": ["n02102177", "Welsh_springer_spaniel"], + "219": ["n02102318", "cocker_spaniel"], + "220": ["n02102480", "Sussex_spaniel"], + "221": ["n02102973", "Irish_water_spaniel"], + "222": ["n02104029", "kuvasz"], + "223": ["n02104365", "schipperke"], + "224": ["n02105056", "groenendael"], + "225": ["n02105162", "malinois"], + "226": ["n02105251", "briard"], + "227": ["n02105412", "kelpie"], + "228": ["n02105505", "komondor"], + "229": ["n02105641", "Old_English_sheepdog"], + "230": ["n02105855", "Shetland_sheepdog"], + "231": ["n02106030", "collie"], + "232": ["n02106166", "Border_collie"], + "233": ["n02106382", "Bouvier_des_Flandres"], + "234": ["n02106550", "Rottweiler"], + "235": ["n02106662", "German_shepherd"], + "236": ["n02107142", "Doberman"], + "237": ["n02107312", "miniature_pinscher"], + "238": ["n02107574", "Greater_Swiss_Mountain_dog"], + "239": ["n02107683", "Bernese_mountain_dog"], + "240": ["n02107908", "Appenzeller"], + "241": ["n02108000", "EntleBucher"], + "242": ["n02108089", "boxer"], + "243": ["n02108422", "bull_mastiff"], + "244": ["n02108551", "Tibetan_mastiff"], + "245": ["n02108915", "French_bulldog"], + "246": ["n02109047", "Great_Dane"], + "247": ["n02109525", "Saint_Bernard"], + "248": ["n02109961", "Eskimo_dog"], + "249": ["n02110063", "malamute"], + "250": ["n02110185", "Siberian_husky"], + "251": ["n02110341", "dalmatian"], + "252": ["n02110627", "affenpinscher"], + "253": ["n02110806", "basenji"], + "254": ["n02110958", "pug"], + "255": ["n02111129", "Leonberg"], + "256": ["n02111277", "Newfoundland"], + "257": ["n02111500", "Great_Pyrenees"], + "258": ["n02111889", "Samoyed"], + "259": ["n02112018", "Pomeranian"], + "260": ["n02112137", "chow"], + "261": ["n02112350", "keeshond"], + "262": 
["n02112706", "Brabancon_griffon"], + "263": ["n02113023", "Pembroke"], + "264": ["n02113186", "Cardigan"], + "265": ["n02113624", "toy_poodle"], + "266": ["n02113712", "miniature_poodle"], + "267": ["n02113799", "standard_poodle"], + "268": ["n02113978", "Mexican_hairless"], + "269": ["n02114367", "timber_wolf"], + "270": ["n02114548", "white_wolf"], + "271": ["n02114712", "red_wolf"], + "272": ["n02114855", "coyote"], + "273": ["n02115641", "dingo"], + "274": ["n02115913", "dhole"], + "275": ["n02116738", "African_hunting_dog"], + "276": ["n02117135", "hyena"], + "277": ["n02119022", "red_fox"], + "278": ["n02119789", "kit_fox"], + "279": ["n02120079", "Arctic_fox"], + "280": ["n02120505", "grey_fox"], + "281": ["n02123045", "tabby"], + "282": ["n02123159", "tiger_cat"], + "283": ["n02123394", "Persian_cat"], + "284": ["n02123597", "Siamese_cat"], + "285": ["n02124075", "Egyptian_cat"], + "286": ["n02125311", "cougar"], + "287": ["n02127052", "lynx"], + "288": ["n02128385", "leopard"], + "289": ["n02128757", "snow_leopard"], + "290": ["n02128925", "jaguar"], + "291": ["n02129165", "lion"], + "292": ["n02129604", "tiger"], + "293": ["n02130308", "cheetah"], + "294": ["n02132136", "brown_bear"], + "295": ["n02133161", "American_black_bear"], + "296": ["n02134084", "ice_bear"], + "297": ["n02134418", "sloth_bear"], + "298": ["n02137549", "mongoose"], + "299": ["n02138441", "meerkat"], + "300": ["n02165105", "tiger_beetle"], + "301": ["n02165456", "ladybug"], + "302": ["n02167151", "ground_beetle"], + "303": ["n02168699", "long-horned_beetle"], + "304": ["n02169497", "leaf_beetle"], + "305": ["n02172182", "dung_beetle"], + "306": ["n02174001", "rhinoceros_beetle"], + "307": ["n02177972", "weevil"], + "308": ["n02190166", "fly"], + "309": ["n02206856", "bee"], + "310": ["n02219486", "ant"], + "311": ["n02226429", "grasshopper"], + "312": ["n02229544", "cricket"], + "313": ["n02231487", "walking_stick"], + "314": ["n02233338", "cockroach"], + "315": ["n02236044", "mantis"], + "316": ["n02256656", "cicada"], + "317": ["n02259212", "leafhopper"], + "318": ["n02264363", "lacewing"], + "319": ["n02268443", "dragonfly"], + "320": ["n02268853", "damselfly"], + "321": ["n02276258", "admiral"], + "322": ["n02277742", "ringlet"], + "323": ["n02279972", "monarch"], + "324": ["n02280649", "cabbage_butterfly"], + "325": ["n02281406", "sulphur_butterfly"], + "326": ["n02281787", "lycaenid"], + "327": ["n02317335", "starfish"], + "328": ["n02319095", "sea_urchin"], + "329": ["n02321529", "sea_cucumber"], + "330": ["n02325366", "wood_rabbit"], + "331": ["n02326432", "hare"], + "332": ["n02328150", "Angora"], + "333": ["n02342885", "hamster"], + "334": ["n02346627", "porcupine"], + "335": ["n02356798", "fox_squirrel"], + "336": ["n02361337", "marmot"], + "337": ["n02363005", "beaver"], + "338": ["n02364673", "guinea_pig"], + "339": ["n02389026", "sorrel"], + "340": ["n02391049", "zebra"], + "341": ["n02395406", "hog"], + "342": ["n02396427", "wild_boar"], + "343": ["n02397096", "warthog"], + "344": ["n02398521", "hippopotamus"], + "345": ["n02403003", "ox"], + "346": ["n02408429", "water_buffalo"], + "347": ["n02410509", "bison"], + "348": ["n02412080", "ram"], + "349": ["n02415577", "bighorn"], + "350": ["n02417914", "ibex"], + "351": ["n02422106", "hartebeest"], + "352": ["n02422699", "impala"], + "353": ["n02423022", "gazelle"], + "354": ["n02437312", "Arabian_camel"], + "355": ["n02437616", "llama"], + "356": ["n02441942", "weasel"], + "357": ["n02442845", "mink"], + "358": ["n02443114", "polecat"], + 
"359": ["n02443484", "black-footed_ferret"], + "360": ["n02444819", "otter"], + "361": ["n02445715", "skunk"], + "362": ["n02447366", "badger"], + "363": ["n02454379", "armadillo"], + "364": ["n02457408", "three-toed_sloth"], + "365": ["n02480495", "orangutan"], + "366": ["n02480855", "gorilla"], + "367": ["n02481823", "chimpanzee"], + "368": ["n02483362", "gibbon"], + "369": ["n02483708", "siamang"], + "370": ["n02484975", "guenon"], + "371": ["n02486261", "patas"], + "372": ["n02486410", "baboon"], + "373": ["n02487347", "macaque"], + "374": ["n02488291", "langur"], + "375": ["n02488702", "colobus"], + "376": ["n02489166", "proboscis_monkey"], + "377": ["n02490219", "marmoset"], + "378": ["n02492035", "capuchin"], + "379": ["n02492660", "howler_monkey"], + "380": ["n02493509", "titi"], + "381": ["n02493793", "spider_monkey"], + "382": ["n02494079", "squirrel_monkey"], + "383": ["n02497673", "Madagascar_cat"], + "384": ["n02500267", "indri"], + "385": ["n02504013", "Indian_elephant"], + "386": ["n02504458", "African_elephant"], + "387": ["n02509815", "lesser_panda"], + "388": ["n02510455", "giant_panda"], + "389": ["n02514041", "barracouta"], + "390": ["n02526121", "eel"], + "391": ["n02536864", "coho"], + "392": ["n02606052", "rock_beauty"], + "393": ["n02607072", "anemone_fish"], + "394": ["n02640242", "sturgeon"], + "395": ["n02641379", "gar"], + "396": ["n02643566", "lionfish"], + "397": ["n02655020", "puffer"], + "398": ["n02666196", "abacus"], + "399": ["n02667093", "abaya"], + "400": ["n02669723", "academic_gown"], + "401": ["n02672831", "accordion"], + "402": ["n02676566", "acoustic_guitar"], + "403": ["n02687172", "aircraft_carrier"], + "404": ["n02690373", "airliner"], + "405": ["n02692877", "airship"], + "406": ["n02699494", "altar"], + "407": ["n02701002", "ambulance"], + "408": ["n02704792", "amphibian"], + "409": ["n02708093", "analog_clock"], + "410": ["n02727426", "apiary"], + "411": ["n02730930", "apron"], + "412": ["n02747177", "ashcan"], + "413": ["n02749479", "assault_rifle"], + "414": ["n02769748", "backpack"], + "415": ["n02776631", "bakery"], + "416": ["n02777292", "balance_beam"], + "417": ["n02782093", "balloon"], + "418": ["n02783161", "ballpoint"], + "419": ["n02786058", "Band_Aid"], + "420": ["n02787622", "banjo"], + "421": ["n02788148", "bannister"], + "422": ["n02790996", "barbell"], + "423": ["n02791124", "barber_chair"], + "424": ["n02791270", "barbershop"], + "425": ["n02793495", "barn"], + "426": ["n02794156", "barometer"], + "427": ["n02795169", "barrel"], + "428": ["n02797295", "barrow"], + "429": ["n02799071", "baseball"], + "430": ["n02802426", "basketball"], + "431": ["n02804414", "bassinet"], + "432": ["n02804610", "bassoon"], + "433": ["n02807133", "bathing_cap"], + "434": ["n02808304", "bath_towel"], + "435": ["n02808440", "bathtub"], + "436": ["n02814533", "beach_wagon"], + "437": ["n02814860", "beacon"], + "438": ["n02815834", "beaker"], + "439": ["n02817516", "bearskin"], + "440": ["n02823428", "beer_bottle"], + "441": ["n02823750", "beer_glass"], + "442": ["n02825657", "bell_cote"], + "443": ["n02834397", "bib"], + "444": ["n02835271", "bicycle-built-for-two"], + "445": ["n02837789", "bikini"], + "446": ["n02840245", "binder"], + "447": ["n02841315", "binoculars"], + "448": ["n02843684", "birdhouse"], + "449": ["n02859443", "boathouse"], + "450": ["n02860847", "bobsled"], + "451": ["n02865351", "bolo_tie"], + "452": ["n02869837", "bonnet"], + "453": ["n02870880", "bookcase"], + "454": ["n02871525", "bookshop"], + "455": ["n02877765", 
"bottlecap"], + "456": ["n02879718", "bow"], + "457": ["n02883205", "bow_tie"], + "458": ["n02892201", "brass"], + "459": ["n02892767", "brassiere"], + "460": ["n02894605", "breakwater"], + "461": ["n02895154", "breastplate"], + "462": ["n02906734", "broom"], + "463": ["n02909870", "bucket"], + "464": ["n02910353", "buckle"], + "465": ["n02916936", "bulletproof_vest"], + "466": ["n02917067", "bullet_train"], + "467": ["n02927161", "butcher_shop"], + "468": ["n02930766", "cab"], + "469": ["n02939185", "caldron"], + "470": ["n02948072", "candle"], + "471": ["n02950826", "cannon"], + "472": ["n02951358", "canoe"], + "473": ["n02951585", "can_opener"], + "474": ["n02963159", "cardigan"], + "475": ["n02965783", "car_mirror"], + "476": ["n02966193", "carousel"], + "477": ["n02966687", "carpenter's_kit"], + "478": ["n02971356", "carton"], + "479": ["n02974003", "car_wheel"], + "480": ["n02977058", "cash_machine"], + "481": ["n02978881", "cassette"], + "482": ["n02979186", "cassette_player"], + "483": ["n02980441", "castle"], + "484": ["n02981792", "catamaran"], + "485": ["n02988304", "CD_player"], + "486": ["n02992211", "cello"], + "487": ["n02992529", "cellular_telephone"], + "488": ["n02999410", "chain"], + "489": ["n03000134", "chainlink_fence"], + "490": ["n03000247", "chain_mail"], + "491": ["n03000684", "chain_saw"], + "492": ["n03014705", "chest"], + "493": ["n03016953", "chiffonier"], + "494": ["n03017168", "chime"], + "495": ["n03018349", "china_cabinet"], + "496": ["n03026506", "Christmas_stocking"], + "497": ["n03028079", "church"], + "498": ["n03032252", "cinema"], + "499": ["n03041632", "cleaver"], + "500": ["n03042490", "cliff_dwelling"], + "501": ["n03045698", "cloak"], + "502": ["n03047690", "clog"], + "503": ["n03062245", "cocktail_shaker"], + "504": ["n03063599", "coffee_mug"], + "505": ["n03063689", "coffeepot"], + "506": ["n03065424", "coil"], + "507": ["n03075370", "combination_lock"], + "508": ["n03085013", "computer_keyboard"], + "509": ["n03089624", "confectionery"], + "510": ["n03095699", "container_ship"], + "511": ["n03100240", "convertible"], + "512": ["n03109150", "corkscrew"], + "513": ["n03110669", "cornet"], + "514": ["n03124043", "cowboy_boot"], + "515": ["n03124170", "cowboy_hat"], + "516": ["n03125729", "cradle"], + "517": ["n03126707", "crane"], + "518": ["n03127747", "crash_helmet"], + "519": ["n03127925", "crate"], + "520": ["n03131574", "crib"], + "521": ["n03133878", "Crock_Pot"], + "522": ["n03134739", "croquet_ball"], + "523": ["n03141823", "crutch"], + "524": ["n03146219", "cuirass"], + "525": ["n03160309", "dam"], + "526": ["n03179701", "desk"], + "527": ["n03180011", "desktop_computer"], + "528": ["n03187595", "dial_telephone"], + "529": ["n03188531", "diaper"], + "530": ["n03196217", "digital_clock"], + "531": ["n03197337", "digital_watch"], + "532": ["n03201208", "dining_table"], + "533": ["n03207743", "dishrag"], + "534": ["n03207941", "dishwasher"], + "535": ["n03208938", "disk_brake"], + "536": ["n03216828", "dock"], + "537": ["n03218198", "dogsled"], + "538": ["n03220513", "dome"], + "539": ["n03223299", "doormat"], + "540": ["n03240683", "drilling_platform"], + "541": ["n03249569", "drum"], + "542": ["n03250847", "drumstick"], + "543": ["n03255030", "dumbbell"], + "544": ["n03259280", "Dutch_oven"], + "545": ["n03271574", "electric_fan"], + "546": ["n03272010", "electric_guitar"], + "547": ["n03272562", "electric_locomotive"], + "548": ["n03290653", "entertainment_center"], + "549": ["n03291819", "envelope"], + "550": ["n03297495", 
"espresso_maker"], + "551": ["n03314780", "face_powder"], + "552": ["n03325584", "feather_boa"], + "553": ["n03337140", "file"], + "554": ["n03344393", "fireboat"], + "555": ["n03345487", "fire_engine"], + "556": ["n03347037", "fire_screen"], + "557": ["n03355925", "flagpole"], + "558": ["n03372029", "flute"], + "559": ["n03376595", "folding_chair"], + "560": ["n03379051", "football_helmet"], + "561": ["n03384352", "forklift"], + "562": ["n03388043", "fountain"], + "563": ["n03388183", "fountain_pen"], + "564": ["n03388549", "four-poster"], + "565": ["n03393912", "freight_car"], + "566": ["n03394916", "French_horn"], + "567": ["n03400231", "frying_pan"], + "568": ["n03404251", "fur_coat"], + "569": ["n03417042", "garbage_truck"], + "570": ["n03424325", "gasmask"], + "571": ["n03425413", "gas_pump"], + "572": ["n03443371", "goblet"], + "573": ["n03444034", "go-kart"], + "574": ["n03445777", "golf_ball"], + "575": ["n03445924", "golfcart"], + "576": ["n03447447", "gondola"], + "577": ["n03447721", "gong"], + "578": ["n03450230", "gown"], + "579": ["n03452741", "grand_piano"], + "580": ["n03457902", "greenhouse"], + "581": ["n03459775", "grille"], + "582": ["n03461385", "grocery_store"], + "583": ["n03467068", "guillotine"], + "584": ["n03476684", "hair_slide"], + "585": ["n03476991", "hair_spray"], + "586": ["n03478589", "half_track"], + "587": ["n03481172", "hammer"], + "588": ["n03482405", "hamper"], + "589": ["n03483316", "hand_blower"], + "590": ["n03485407", "hand-held_computer"], + "591": ["n03485794", "handkerchief"], + "592": ["n03492542", "hard_disc"], + "593": ["n03494278", "harmonica"], + "594": ["n03495258", "harp"], + "595": ["n03496892", "harvester"], + "596": ["n03498962", "hatchet"], + "597": ["n03527444", "holster"], + "598": ["n03529860", "home_theater"], + "599": ["n03530642", "honeycomb"], + "600": ["n03532672", "hook"], + "601": ["n03534580", "hoopskirt"], + "602": ["n03535780", "horizontal_bar"], + "603": ["n03538406", "horse_cart"], + "604": ["n03544143", "hourglass"], + "605": ["n03584254", "iPod"], + "606": ["n03584829", "iron"], + "607": ["n03590841", "jack-o'-lantern"], + "608": ["n03594734", "jean"], + "609": ["n03594945", "jeep"], + "610": ["n03595614", "jersey"], + "611": ["n03598930", "jigsaw_puzzle"], + "612": ["n03599486", "jinrikisha"], + "613": ["n03602883", "joystick"], + "614": ["n03617480", "kimono"], + "615": ["n03623198", "knee_pad"], + "616": ["n03627232", "knot"], + "617": ["n03630383", "lab_coat"], + "618": ["n03633091", "ladle"], + "619": ["n03637318", "lampshade"], + "620": ["n03642806", "laptop"], + "621": ["n03649909", "lawn_mower"], + "622": ["n03657121", "lens_cap"], + "623": ["n03658185", "letter_opener"], + "624": ["n03661043", "library"], + "625": ["n03662601", "lifeboat"], + "626": ["n03666591", "lighter"], + "627": ["n03670208", "limousine"], + "628": ["n03673027", "liner"], + "629": ["n03676483", "lipstick"], + "630": ["n03680355", "Loafer"], + "631": ["n03690938", "lotion"], + "632": ["n03691459", "loudspeaker"], + "633": ["n03692522", "loupe"], + "634": ["n03697007", "lumbermill"], + "635": ["n03706229", "magnetic_compass"], + "636": ["n03709823", "mailbag"], + "637": ["n03710193", "mailbox"], + "638": ["n03710637", "maillot"], + "639": ["n03710721", "maillot"], + "640": ["n03717622", "manhole_cover"], + "641": ["n03720891", "maraca"], + "642": ["n03721384", "marimba"], + "643": ["n03724870", "mask"], + "644": ["n03729826", "matchstick"], + "645": ["n03733131", "maypole"], + "646": ["n03733281", "maze"], + "647": ["n03733805", 
"measuring_cup"], + "648": ["n03742115", "medicine_chest"], + "649": ["n03743016", "megalith"], + "650": ["n03759954", "microphone"], + "651": ["n03761084", "microwave"], + "652": ["n03763968", "military_uniform"], + "653": ["n03764736", "milk_can"], + "654": ["n03769881", "minibus"], + "655": ["n03770439", "miniskirt"], + "656": ["n03770679", "minivan"], + "657": ["n03773504", "missile"], + "658": ["n03775071", "mitten"], + "659": ["n03775546", "mixing_bowl"], + "660": ["n03776460", "mobile_home"], + "661": ["n03777568", "Model_T"], + "662": ["n03777754", "modem"], + "663": ["n03781244", "monastery"], + "664": ["n03782006", "monitor"], + "665": ["n03785016", "moped"], + "666": ["n03786901", "mortar"], + "667": ["n03787032", "mortarboard"], + "668": ["n03788195", "mosque"], + "669": ["n03788365", "mosquito_net"], + "670": ["n03791053", "motor_scooter"], + "671": ["n03792782", "mountain_bike"], + "672": ["n03792972", "mountain_tent"], + "673": ["n03793489", "mouse"], + "674": ["n03794056", "mousetrap"], + "675": ["n03796401", "moving_van"], + "676": ["n03803284", "muzzle"], + "677": ["n03804744", "nail"], + "678": ["n03814639", "neck_brace"], + "679": ["n03814906", "necklace"], + "680": ["n03825788", "nipple"], + "681": ["n03832673", "notebook"], + "682": ["n03837869", "obelisk"], + "683": ["n03838899", "oboe"], + "684": ["n03840681", "ocarina"], + "685": ["n03841143", "odometer"], + "686": ["n03843555", "oil_filter"], + "687": ["n03854065", "organ"], + "688": ["n03857828", "oscilloscope"], + "689": ["n03866082", "overskirt"], + "690": ["n03868242", "oxcart"], + "691": ["n03868863", "oxygen_mask"], + "692": ["n03871628", "packet"], + "693": ["n03873416", "paddle"], + "694": ["n03874293", "paddlewheel"], + "695": ["n03874599", "padlock"], + "696": ["n03876231", "paintbrush"], + "697": ["n03877472", "pajama"], + "698": ["n03877845", "palace"], + "699": ["n03884397", "panpipe"], + "700": ["n03887697", "paper_towel"], + "701": ["n03888257", "parachute"], + "702": ["n03888605", "parallel_bars"], + "703": ["n03891251", "park_bench"], + "704": ["n03891332", "parking_meter"], + "705": ["n03895866", "passenger_car"], + "706": ["n03899768", "patio"], + "707": ["n03902125", "pay-phone"], + "708": ["n03903868", "pedestal"], + "709": ["n03908618", "pencil_box"], + "710": ["n03908714", "pencil_sharpener"], + "711": ["n03916031", "perfume"], + "712": ["n03920288", "Petri_dish"], + "713": ["n03924679", "photocopier"], + "714": ["n03929660", "pick"], + "715": ["n03929855", "pickelhaube"], + "716": ["n03930313", "picket_fence"], + "717": ["n03930630", "pickup"], + "718": ["n03933933", "pier"], + "719": ["n03935335", "piggy_bank"], + "720": ["n03937543", "pill_bottle"], + "721": ["n03938244", "pillow"], + "722": ["n03942813", "ping-pong_ball"], + "723": ["n03944341", "pinwheel"], + "724": ["n03947888", "pirate"], + "725": ["n03950228", "pitcher"], + "726": ["n03954731", "plane"], + "727": ["n03956157", "planetarium"], + "728": ["n03958227", "plastic_bag"], + "729": ["n03961711", "plate_rack"], + "730": ["n03967562", "plow"], + "731": ["n03970156", "plunger"], + "732": ["n03976467", "Polaroid_camera"], + "733": ["n03976657", "pole"], + "734": ["n03977966", "police_van"], + "735": ["n03980874", "poncho"], + "736": ["n03982430", "pool_table"], + "737": ["n03983396", "pop_bottle"], + "738": ["n03991062", "pot"], + "739": ["n03992509", "potter's_wheel"], + "740": ["n03995372", "power_drill"], + "741": ["n03998194", "prayer_rug"], + "742": ["n04004767", "printer"], + "743": ["n04005630", "prison"], + "744": 
["n04008634", "projectile"], + "745": ["n04009552", "projector"], + "746": ["n04019541", "puck"], + "747": ["n04023962", "punching_bag"], + "748": ["n04026417", "purse"], + "749": ["n04033901", "quill"], + "750": ["n04033995", "quilt"], + "751": ["n04037443", "racer"], + "752": ["n04039381", "racket"], + "753": ["n04040759", "radiator"], + "754": ["n04041544", "radio"], + "755": ["n04044716", "radio_telescope"], + "756": ["n04049303", "rain_barrel"], + "757": ["n04065272", "recreational_vehicle"], + "758": ["n04067472", "reel"], + "759": ["n04069434", "reflex_camera"], + "760": ["n04070727", "refrigerator"], + "761": ["n04074963", "remote_control"], + "762": ["n04081281", "restaurant"], + "763": ["n04086273", "revolver"], + "764": ["n04090263", "rifle"], + "765": ["n04099969", "rocking_chair"], + "766": ["n04111531", "rotisserie"], + "767": ["n04116512", "rubber_eraser"], + "768": ["n04118538", "rugby_ball"], + "769": ["n04118776", "rule"], + "770": ["n04120489", "running_shoe"], + "771": ["n04125021", "safe"], + "772": ["n04127249", "safety_pin"], + "773": ["n04131690", "saltshaker"], + "774": ["n04133789", "sandal"], + "775": ["n04136333", "sarong"], + "776": ["n04141076", "sax"], + "777": ["n04141327", "scabbard"], + "778": ["n04141975", "scale"], + "779": ["n04146614", "school_bus"], + "780": ["n04147183", "schooner"], + "781": ["n04149813", "scoreboard"], + "782": ["n04152593", "screen"], + "783": ["n04153751", "screw"], + "784": ["n04154565", "screwdriver"], + "785": ["n04162706", "seat_belt"], + "786": ["n04179913", "sewing_machine"], + "787": ["n04192698", "shield"], + "788": ["n04200800", "shoe_shop"], + "789": ["n04201297", "shoji"], + "790": ["n04204238", "shopping_basket"], + "791": ["n04204347", "shopping_cart"], + "792": ["n04208210", "shovel"], + "793": ["n04209133", "shower_cap"], + "794": ["n04209239", "shower_curtain"], + "795": ["n04228054", "ski"], + "796": ["n04229816", "ski_mask"], + "797": ["n04235860", "sleeping_bag"], + "798": ["n04238763", "slide_rule"], + "799": ["n04239074", "sliding_door"], + "800": ["n04243546", "slot"], + "801": ["n04251144", "snorkel"], + "802": ["n04252077", "snowmobile"], + "803": ["n04252225", "snowplow"], + "804": ["n04254120", "soap_dispenser"], + "805": ["n04254680", "soccer_ball"], + "806": ["n04254777", "sock"], + "807": ["n04258138", "solar_dish"], + "808": ["n04259630", "sombrero"], + "809": ["n04263257", "soup_bowl"], + "810": ["n04264628", "space_bar"], + "811": ["n04265275", "space_heater"], + "812": ["n04266014", "space_shuttle"], + "813": ["n04270147", "spatula"], + "814": ["n04273569", "speedboat"], + "815": ["n04275548", "spider_web"], + "816": ["n04277352", "spindle"], + "817": ["n04285008", "sports_car"], + "818": ["n04286575", "spotlight"], + "819": ["n04296562", "stage"], + "820": ["n04310018", "steam_locomotive"], + "821": ["n04311004", "steel_arch_bridge"], + "822": ["n04311174", "steel_drum"], + "823": ["n04317175", "stethoscope"], + "824": ["n04325704", "stole"], + "825": ["n04326547", "stone_wall"], + "826": ["n04328186", "stopwatch"], + "827": ["n04330267", "stove"], + "828": ["n04332243", "strainer"], + "829": ["n04335435", "streetcar"], + "830": ["n04336792", "stretcher"], + "831": ["n04344873", "studio_couch"], + "832": ["n04346328", "stupa"], + "833": ["n04347754", "submarine"], + "834": ["n04350905", "suit"], + "835": ["n04355338", "sundial"], + "836": ["n04355933", "sunglass"], + "837": ["n04356056", "sunglasses"], + "838": ["n04357314", "sunscreen"], + "839": ["n04366367", "suspension_bridge"], + "840": 
["n04367480", "swab"], + "841": ["n04370456", "sweatshirt"], + "842": ["n04371430", "swimming_trunks"], + "843": ["n04371774", "swing"], + "844": ["n04372370", "switch"], + "845": ["n04376876", "syringe"], + "846": ["n04380533", "table_lamp"], + "847": ["n04389033", "tank"], + "848": ["n04392985", "tape_player"], + "849": ["n04398044", "teapot"], + "850": ["n04399382", "teddy"], + "851": ["n04404412", "television"], + "852": ["n04409515", "tennis_ball"], + "853": ["n04417672", "thatch"], + "854": ["n04418357", "theater_curtain"], + "855": ["n04423845", "thimble"], + "856": ["n04428191", "thresher"], + "857": ["n04429376", "throne"], + "858": ["n04435653", "tile_roof"], + "859": ["n04442312", "toaster"], + "860": ["n04443257", "tobacco_shop"], + "861": ["n04447861", "toilet_seat"], + "862": ["n04456115", "torch"], + "863": ["n04458633", "totem_pole"], + "864": ["n04461696", "tow_truck"], + "865": ["n04462240", "toyshop"], + "866": ["n04465501", "tractor"], + "867": ["n04467665", "trailer_truck"], + "868": ["n04476259", "tray"], + "869": ["n04479046", "trench_coat"], + "870": ["n04482393", "tricycle"], + "871": ["n04483307", "trimaran"], + "872": ["n04485082", "tripod"], + "873": ["n04486054", "triumphal_arch"], + "874": ["n04487081", "trolleybus"], + "875": ["n04487394", "trombone"], + "876": ["n04493381", "tub"], + "877": ["n04501370", "turnstile"], + "878": ["n04505470", "typewriter_keyboard"], + "879": ["n04507155", "umbrella"], + "880": ["n04509417", "unicycle"], + "881": ["n04515003", "upright"], + "882": ["n04517823", "vacuum"], + "883": ["n04522168", "vase"], + "884": ["n04523525", "vault"], + "885": ["n04525038", "velvet"], + "886": ["n04525305", "vending_machine"], + "887": ["n04532106", "vestment"], + "888": ["n04532670", "viaduct"], + "889": ["n04536866", "violin"], + "890": ["n04540053", "volleyball"], + "891": ["n04542943", "waffle_iron"], + "892": ["n04548280", "wall_clock"], + "893": ["n04548362", "wallet"], + "894": ["n04550184", "wardrobe"], + "895": ["n04552348", "warplane"], + "896": ["n04553703", "washbasin"], + "897": ["n04554684", "washer"], + "898": ["n04557648", "water_bottle"], + "899": ["n04560804", "water_jug"], + "900": ["n04562935", "water_tower"], + "901": ["n04579145", "whiskey_jug"], + "902": ["n04579432", "whistle"], + "903": ["n04584207", "wig"], + "904": ["n04589890", "window_screen"], + "905": ["n04590129", "window_shade"], + "906": ["n04591157", "Windsor_tie"], + "907": ["n04591713", "wine_bottle"], + "908": ["n04592741", "wing"], + "909": ["n04596742", "wok"], + "910": ["n04597913", "wooden_spoon"], + "911": ["n04599235", "wool"], + "912": ["n04604644", "worm_fence"], + "913": ["n04606251", "wreck"], + "914": ["n04612504", "yawl"], + "915": ["n04613696", "yurt"], + "916": ["n06359193", "web_site"], + "917": ["n06596364", "comic_book"], + "918": ["n06785654", "crossword_puzzle"], + "919": ["n06794110", "street_sign"], + "920": ["n06874185", "traffic_light"], + "921": ["n07248320", "book_jacket"], + "922": ["n07565083", "menu"], + "923": ["n07579787", "plate"], + "924": ["n07583066", "guacamole"], + "925": ["n07584110", "consomme"], + "926": ["n07590611", "hot_pot"], + "927": ["n07613480", "trifle"], + "928": ["n07614500", "ice_cream"], + "929": ["n07615774", "ice_lolly"], + "930": ["n07684084", "French_loaf"], + "931": ["n07693725", "bagel"], + "932": ["n07695742", "pretzel"], + "933": ["n07697313", "cheeseburger"], + "934": ["n07697537", "hotdog"], + "935": ["n07711569", "mashed_potato"], + "936": ["n07714571", "head_cabbage"], + "937": ["n07714990", 
"broccoli"], + "938": ["n07715103", "cauliflower"], + "939": ["n07716358", "zucchini"], + "940": ["n07716906", "spaghetti_squash"], + "941": ["n07717410", "acorn_squash"], + "942": ["n07717556", "butternut_squash"], + "943": ["n07718472", "cucumber"], + "944": ["n07718747", "artichoke"], + "945": ["n07720875", "bell_pepper"], + "946": ["n07730033", "cardoon"], + "947": ["n07734744", "mushroom"], + "948": ["n07742313", "Granny_Smith"], + "949": ["n07745940", "strawberry"], + "950": ["n07747607", "orange"], + "951": ["n07749582", "lemon"], + "952": ["n07753113", "fig"], + "953": ["n07753275", "pineapple"], + "954": ["n07753592", "banana"], + "955": ["n07754684", "jackfruit"], + "956": ["n07760859", "custard_apple"], + "957": ["n07768694", "pomegranate"], + "958": ["n07802026", "hay"], + "959": ["n07831146", "carbonara"], + "960": ["n07836838", "chocolate_sauce"], + "961": ["n07860988", "dough"], + "962": ["n07871810", "meat_loaf"], + "963": ["n07873807", "pizza"], + "964": ["n07875152", "potpie"], + "965": ["n07880968", "burrito"], + "966": ["n07892512", "red_wine"], + "967": ["n07920052", "espresso"], + "968": ["n07930864", "cup"], + "969": ["n07932039", "eggnog"], + "970": ["n09193705", "alp"], + "971": ["n09229709", "bubble"], + "972": ["n09246464", "cliff"], + "973": ["n09256479", "coral_reef"], + "974": ["n09288635", "geyser"], + "975": ["n09332890", "lakeside"], + "976": ["n09399592", "promontory"], + "977": ["n09421951", "sandbar"], + "978": ["n09428293", "seashore"], + "979": ["n09468604", "valley"], + "980": ["n09472597", "volcano"], + "981": ["n09835506", "ballplayer"], + "982": ["n10148035", "groom"], + "983": ["n10565667", "scuba_diver"], + "984": ["n11879895", "rapeseed"], + "985": ["n11939491", "daisy"], + "986": ["n12057211", "yellow_lady's_slipper"], + "987": ["n12144580", "corn"], + "988": ["n12267677", "acorn"], + "989": ["n12620546", "hip"], + "990": ["n12768682", "buckeye"], + "991": ["n12985857", "coral_fungus"], + "992": ["n12998815", "agaric"], + "993": ["n13037406", "gyromitra"], + "994": ["n13040303", "stinkhorn"], + "995": ["n13044778", "earthstar"], + "996": ["n13052670", "hen-of-the-woods"], + "997": ["n13054560", "bolete"], + "998": ["n13133613", "ear"], + "999": ["n15075141", "toilet_tissue"] +} \ No newline at end of file diff --git a/images/cat.png b/images/cat.png new file mode 100644 index 0000000000000000000000000000000000000000..4c869201dbfa331abf49c21435186d96d8a9f18b Binary files /dev/null and b/images/cat.png differ diff --git a/images/dog.png b/images/dog.png new file mode 100644 index 0000000000000000000000000000000000000000..a59f1ca272f13e2df0453871311fc84ab0d5da24 Binary files /dev/null and b/images/dog.png differ diff --git a/images/panda.png b/images/panda.png new file mode 100644 index 0000000000000000000000000000000000000000..b43f4b5c1cc135f45f4056ef8912fdc4ce5b2571 Binary files /dev/null and b/images/panda.png differ diff --git a/install.sh b/install.sh new file mode 100644 index 0000000000000000000000000000000000000000..034b4ecad14b2670f72e8b4fe52c6acf57686489 --- /dev/null +++ b/install.sh @@ -0,0 +1,2 @@ +pip install -e causal-conv1d +pip install -e mamba \ No newline at end of file diff --git a/kinetics_class_index.py b/kinetics_class_index.py new file mode 100644 index 0000000000000000000000000000000000000000..597e23e72c690f2dce0525b24bdcc2a992c4d594 --- /dev/null +++ b/kinetics_class_index.py @@ -0,0 +1,402 @@ +kinetics_classnames = { + "0": "riding a bike", + "1": "marching", + "2": "dodgeball", + "3": "playing cymbals", + "4": "checking 
tires", + "5": "roller skating", + "6": "tasting beer", + "7": "clapping", + "8": "drawing", + "9": "juggling fire", + "10": "bobsledding", + "11": "petting animal (not cat)", + "12": "spray painting", + "13": "training dog", + "14": "eating watermelon", + "15": "building cabinet", + "16": "applauding", + "17": "playing harp", + "18": "balloon blowing", + "19": "sled dog racing", + "20": "wrestling", + "21": "pole vault", + "22": "hurling (sport)", + "23": "riding scooter", + "24": "shearing sheep", + "25": "sweeping floor", + "26": "eating carrots", + "27": "skateboarding", + "28": "dunking basketball", + "29": "disc golfing", + "30": "eating spaghetti", + "31": "playing flute", + "32": "riding mechanical bull", + "33": "making sushi", + "34": "trapezing", + "35": "picking fruit", + "36": "stretching leg", + "37": "playing ukulele", + "38": "tying tie", + "39": "skydiving", + "40": "playing cello", + "41": "jumping into pool", + "42": "shooting goal (soccer)", + "43": "trimming trees", + "44": "bookbinding", + "45": "ski jumping", + "46": "walking the dog", + "47": "riding unicycle", + "48": "shaving head", + "49": "hopscotch", + "50": "playing piano", + "51": "parasailing", + "52": "bartending", + "53": "kicking field goal", + "54": "finger snapping", + "55": "dining", + "56": "yawning", + "57": "peeling potatoes", + "58": "canoeing or kayaking", + "59": "front raises", + "60": "laughing", + "61": "dancing macarena", + "62": "digging", + "63": "reading newspaper", + "64": "hitting baseball", + "65": "clay pottery making", + "66": "exercising with an exercise ball", + "67": "playing saxophone", + "68": "shooting basketball", + "69": "washing hair", + "70": "lunge", + "71": "brushing hair", + "72": "curling hair", + "73": "kitesurfing", + "74": "tapping guitar", + "75": "bending back", + "76": "skipping rope", + "77": "situp", + "78": "folding paper", + "79": "cracking neck", + "80": "assembling computer", + "81": "cleaning gutters", + "82": "blowing out candles", + "83": "shaking hands", + "84": "dancing gangnam style", + "85": "windsurfing", + "86": "tap dancing", + "87": "skiing (not slalom or crosscountry)", + "88": "bandaging", + "89": "push up", + "90": "doing nails", + "91": "punching person (boxing)", + "92": "bouncing on trampoline", + "93": "scrambling eggs", + "94": "singing", + "95": "cleaning floor", + "96": "krumping", + "97": "drumming fingers", + "98": "snowmobiling", + "99": "gymnastics tumbling", + "100": "headbanging", + "101": "catching or throwing frisbee", + "102": "riding elephant", + "103": "bee keeping", + "104": "feeding birds", + "105": "snatch weight lifting", + "106": "mowing lawn", + "107": "fixing hair", + "108": "playing trumpet", + "109": "flying kite", + "110": "crossing river", + "111": "swinging legs", + "112": "sanding floor", + "113": "belly dancing", + "114": "sneezing", + "115": "clean and jerk", + "116": "side kick", + "117": "filling eyebrows", + "118": "shuffling cards", + "119": "recording music", + "120": "cartwheeling", + "121": "feeding fish", + "122": "folding clothes", + "123": "water skiing", + "124": "tobogganing", + "125": "blowing leaves", + "126": "smoking", + "127": "unboxing", + "128": "tai chi", + "129": "waxing legs", + "130": "riding camel", + "131": "slapping", + "132": "tossing salad", + "133": "capoeira", + "134": "playing cards", + "135": "playing organ", + "136": "playing violin", + "137": "playing drums", + "138": "tapping pen", + "139": "vault", + "140": "shoveling snow", + "141": "playing tennis", + "142": "getting a 
tattoo", + "143": "making a sandwich", + "144": "making tea", + "145": "grinding meat", + "146": "squat", + "147": "eating doughnuts", + "148": "ice fishing", + "149": "snowkiting", + "150": "kicking soccer ball", + "151": "playing controller", + "152": "giving or receiving award", + "153": "welding", + "154": "throwing discus", + "155": "throwing axe", + "156": "ripping paper", + "157": "swimming butterfly stroke", + "158": "air drumming", + "159": "blowing nose", + "160": "hockey stop", + "161": "taking a shower", + "162": "bench pressing", + "163": "planting trees", + "164": "pumping fist", + "165": "climbing tree", + "166": "tickling", + "167": "high kick", + "168": "waiting in line", + "169": "slacklining", + "170": "tango dancing", + "171": "hurdling", + "172": "carrying baby", + "173": "celebrating", + "174": "sharpening knives", + "175": "passing American football (in game)", + "176": "headbutting", + "177": "playing recorder", + "178": "brush painting", + "179": "garbage collecting", + "180": "robot dancing", + "181": "shredding paper", + "182": "pumping gas", + "183": "rock climbing", + "184": "hula hooping", + "185": "braiding hair", + "186": "opening present", + "187": "texting", + "188": "decorating the christmas tree", + "189": "answering questions", + "190": "playing keyboard", + "191": "writing", + "192": "bungee jumping", + "193": "sniffing", + "194": "eating burger", + "195": "playing accordion", + "196": "making pizza", + "197": "playing volleyball", + "198": "tasting food", + "199": "pushing cart", + "200": "spinning poi", + "201": "cleaning windows", + "202": "arm wrestling", + "203": "changing oil", + "204": "swimming breast stroke", + "205": "tossing coin", + "206": "deadlifting", + "207": "hoverboarding", + "208": "cutting watermelon", + "209": "cheerleading", + "210": "snorkeling", + "211": "washing hands", + "212": "eating cake", + "213": "pull ups", + "214": "surfing water", + "215": "eating hotdog", + "216": "holding snake", + "217": "playing harmonica", + "218": "ironing", + "219": "cutting nails", + "220": "golf chipping", + "221": "shot put", + "222": "hugging", + "223": "playing clarinet", + "224": "faceplanting", + "225": "trimming or shaving beard", + "226": "drinking shots", + "227": "riding mountain bike", + "228": "tying bow tie", + "229": "swinging on something", + "230": "skiing crosscountry", + "231": "unloading truck", + "232": "cleaning pool", + "233": "jogging", + "234": "ice climbing", + "235": "mopping floor", + "236": "making bed", + "237": "diving cliff", + "238": "washing dishes", + "239": "grooming dog", + "240": "weaving basket", + "241": "frying vegetables", + "242": "stomping grapes", + "243": "moving furniture", + "244": "cooking sausages", + "245": "doing laundry", + "246": "dying hair", + "247": "knitting", + "248": "reading book", + "249": "baby waking up", + "250": "punching bag", + "251": "surfing crowd", + "252": "cooking chicken", + "253": "pushing car", + "254": "springboard diving", + "255": "swing dancing", + "256": "massaging legs", + "257": "beatboxing", + "258": "breading or breadcrumbing", + "259": "somersaulting", + "260": "brushing teeth", + "261": "stretching arm", + "262": "juggling balls", + "263": "massaging person's head", + "264": "eating ice cream", + "265": "extinguishing fire", + "266": "hammer throw", + "267": "whistling", + "268": "crawling baby", + "269": "using remote controller (not gaming)", + "270": "playing cricket", + "271": "opening bottle", + "272": "playing xylophone", + "273": "motorcycling", + 
"274": "driving car", + "275": "exercising arm", + "276": "passing American football (not in game)", + "277": "playing kickball", + "278": "sticking tongue out", + "279": "flipping pancake", + "280": "catching fish", + "281": "eating chips", + "282": "shaking head", + "283": "sword fighting", + "284": "playing poker", + "285": "cooking on campfire", + "286": "doing aerobics", + "287": "paragliding", + "288": "using segway", + "289": "folding napkins", + "290": "playing bagpipes", + "291": "gargling", + "292": "skiing slalom", + "293": "strumming guitar", + "294": "javelin throw", + "295": "waxing back", + "296": "riding or walking with horse", + "297": "plastering", + "298": "long jump", + "299": "parkour", + "300": "wrapping present", + "301": "egg hunting", + "302": "archery", + "303": "cleaning toilet", + "304": "swimming backstroke", + "305": "snowboarding", + "306": "catching or throwing baseball", + "307": "massaging back", + "308": "blowing glass", + "309": "playing guitar", + "310": "playing chess", + "311": "golf driving", + "312": "presenting weather forecast", + "313": "rock scissors paper", + "314": "high jump", + "315": "baking cookies", + "316": "using computer", + "317": "washing feet", + "318": "arranging flowers", + "319": "playing bass guitar", + "320": "spraying", + "321": "cutting pineapple", + "322": "waxing chest", + "323": "auctioning", + "324": "jetskiing", + "325": "drinking", + "326": "busking", + "327": "playing monopoly", + "328": "salsa dancing", + "329": "waxing eyebrows", + "330": "watering plants", + "331": "zumba", + "332": "chopping wood", + "333": "pushing wheelchair", + "334": "carving pumpkin", + "335": "building shed", + "336": "making jewelry", + "337": "catching or throwing softball", + "338": "bending metal", + "339": "ice skating", + "340": "dancing charleston", + "341": "abseiling", + "342": "climbing a rope", + "343": "crying", + "344": "cleaning shoes", + "345": "dancing ballet", + "346": "driving tractor", + "347": "triple jump", + "348": "throwing ball", + "349": "getting a haircut", + "350": "running on treadmill", + "351": "climbing ladder", + "352": "blasting sand", + "353": "playing trombone", + "354": "drop kicking", + "355": "country line dancing", + "356": "changing wheel", + "357": "feeding goats", + "358": "tying knot (not on a tie)", + "359": "setting table", + "360": "shaving legs", + "361": "kissing", + "362": "riding mule", + "363": "counting money", + "364": "laying bricks", + "365": "barbequing", + "366": "news anchoring", + "367": "smoking hookah", + "368": "cooking egg", + "369": "peeling apples", + "370": "yoga", + "371": "sharpening pencil", + "372": "dribbling basketball", + "373": "petting cat", + "374": "playing ice hockey", + "375": "milking cow", + "376": "shining shoes", + "377": "juggling soccer ball", + "378": "scuba diving", + "379": "playing squash or racquetball", + "380": "drinking beer", + "381": "sign language interpreting", + "382": "playing basketball", + "383": "breakdancing", + "384": "testifying", + "385": "making snowman", + "386": "golf putting", + "387": "playing didgeridoo", + "388": "biking through snow", + "389": "sailing", + "390": "jumpstyle dancing", + "391": "water sliding", + "392": "grooming horse", + "393": "massaging feet", + "394": "playing paintball", + "395": "making a cake", + "396": "bowling", + "397": "contact juggling", + "398": "applying cream", + "399": "playing badminton" +} \ No newline at end of file diff --git a/mamba/.gitmodules b/mamba/.gitmodules new file mode 100644 index 
0000000000000000000000000000000000000000..a7445800fb64f3ae664c0b994a54235105986d2e --- /dev/null +++ b/mamba/.gitmodules @@ -0,0 +1,3 @@ +[submodule "3rdparty/lm-evaluation-harness"] + path = 3rdparty/lm-evaluation-harness + url = https://github.com/EleutherAI/lm-evaluation-harness/ diff --git a/mamba/AUTHORS b/mamba/AUTHORS new file mode 100644 index 0000000000000000000000000000000000000000..38557a872f8d603ed963a05c211de7032de5926b --- /dev/null +++ b/mamba/AUTHORS @@ -0,0 +1,2 @@ +Tri Dao, tri@tridao.me +Albert Gu, agu@andrew.cmu.edu diff --git a/mamba/LICENSE b/mamba/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..f4abe24eb520fbb077753ae4f34bfaa43cb3b83f --- /dev/null +++ b/mamba/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2023 Tri Dao, Albert Gu + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/mamba/README.md b/mamba/README.md new file mode 100644 index 0000000000000000000000000000000000000000..754cefd7f862a90bad8fbdff71e3793a4e7849e3 --- /dev/null +++ b/mamba/README.md @@ -0,0 +1,149 @@ +# Mamba + +![Mamba](assets/selection.png "Selective State Space") +> **Mamba: Linear-Time Sequence Modeling with Selective State Spaces**\ +> Albert Gu*, Tri Dao*\ +> Paper: https://arxiv.org/abs/2312.00752 + +## About + +Mamba is a new state space model architecture showing promising performance on information-dense data such as language modeling, where previous subquadratic models fall short of Transformers. +It is based on the line of progress on [structured state space models](https://github.com/state-spaces/s4), +with an efficient hardware-aware design and implementation in the spirit of [FlashAttention](https://github.com/Dao-AILab/flash-attention). + +## Installation + +- `pip install causal-conv1d`: an efficient implementation of a simple causal Conv1d layer used inside the Mamba block. +- `pip install mamba-ssm`: the core Mamba package. + +It can also be built from source with `pip install .` from this repository. + +If `pip` complains about PyTorch versions, try passing `--no-build-isolation` to `pip`. + +Other requirements: +- Linux +- NVIDIA GPU +- PyTorch 1.12+ +- CUDA 11.6+ + +## Usage + +We expose several levels of interface with the Mamba model. + +### Selective SSM + +Mamba is based on a selective SSM layer, which is the focus of the paper (Section 3; Algorithm 2). + +Source: [ops/selective_scan_interface.py](mamba_ssm/ops/selective_scan_interface.py). + +### Mamba Block + +The main module of this repository is the Mamba architecture block wrapping the selective SSM. + +Source: [modules/mamba_simple.py](mamba_ssm/modules/mamba_simple.py). 
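Before the block-level usage below, it may help to see what the selective SSM described above actually computes. The following is an editorial sketch in plain PyTorch of the sequential recurrence from the paper (discretize A and B with the input-dependent step delta, then scan over the sequence); the function name and the shape conventions are illustrative assumptions, and this is not the fused CUDA kernel added later in this diff.

```
import torch

def selective_scan_reference(u, delta, A, B, C, D=None):
    # Naive O(seqlen) reference scan -- for intuition only, not performance.
    # Assumed shapes: u, delta: (batch, dim, seqlen); A: (dim, dstate);
    # B, C: (batch, dstate, seqlen); D: (dim,) optional skip connection.
    batch, dim, seqlen = u.shape
    dstate = A.shape[1]
    # Discretization: A_bar = exp(delta * A); the B_bar * u term is folded into one tensor.
    deltaA = torch.exp(torch.einsum("bdl,dn->bdln", delta, A))
    deltaB_u = torch.einsum("bdl,bnl,bdl->bdln", delta, B, u)
    x = u.new_zeros(batch, dim, dstate)
    ys = []
    for t in range(seqlen):
        x = deltaA[:, :, t] * x + deltaB_u[:, :, t]            # state update
        ys.append(torch.einsum("bdn,bn->bd", x, C[:, :, t]))   # readout
    y = torch.stack(ys, dim=-1)                                # (batch, dim, seqlen)
    if D is not None:
        y = y + u * D[None, :, None]                           # skip connection
    return y
```

The `csrc/selective_scan` kernels included further down in this diff compute the same recurrence (plus a reverse-order scan for the backward pass) with a hardware-aware parallel scan rather than a Python loop.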
+
+Usage:
+```
+import torch
+from mamba_ssm import Mamba
+
+batch, length, dim = 2, 64, 16
+x = torch.randn(batch, length, dim).to("cuda")
+model = Mamba(
+    # This module uses roughly 3 * expand * d_model^2 parameters
+    d_model=dim,  # Model dimension d_model
+    d_state=16,   # SSM state expansion factor
+    d_conv=4,     # Local convolution width
+    expand=2,     # Block expansion factor
+).to("cuda")
+y = model(x)
+assert y.shape == x.shape
+```
+
+### Mamba Language Model
+
+Finally, we provide an example of a complete language model: a deep sequence model backbone (with repeating Mamba blocks) + language model head.
+
+Source: [models/mixer_seq_simple.py](mamba_ssm/models/mixer_seq_simple.py).
+
+This is an example of how to integrate Mamba into an end-to-end neural network.
+This example is used in the generation scripts below.
+
+
+
+## Pretrained Models
+
+Pretrained models are uploaded to
+[HuggingFace](https://huggingface.co/state-spaces): `mamba-130m`, `mamba-370m`,
+`mamba-790m`, `mamba-1.4b`, `mamba-2.8b`.
+
+The models will be autodownloaded by the generation script below.
+
+These models were trained on the [Pile](https://huggingface.co/datasets/EleutherAI/pile), and follow the standard model dimensions described by GPT-3 and followed by many open source models:
+
+| Parameters | Layers | Model dim. |
+|------------|--------|------------|
+| 130M       | 12     | 768        |
+| 370M       | 24     | 1024       |
+| 790M       | 24     | 1536       |
+| 1.4B       | 24     | 2048       |
+| 2.8B       | 32     | 2560       |
+
+(The layer count of Mamba should be doubled, as two Mamba blocks are needed for each "layer" (MHA block + MLP block) of a Transformer.)
+
+Note: these are base models trained only for 300B tokens, without any form of downstream modification (instruction tuning, etc.).
+Performance is expected to be comparable or better than other architectures trained on similar data, but not to match larger or fine-tuned models.
+
+
+## Evaluations
+
+To run zero-shot evaluations of models (corresponding to Table 3 of the paper),
+we use the
+[lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness/tree/big-refactor)
+library.
+
+1. Pull the `lm-evaluation-harness` repo by `git submodule update --init
+   --recursive`. We use the `big-refactor` branch.
+2. Install `lm-evaluation-harness`: `pip install -e 3rdparty/lm-evaluation-harness`
+3. Run evaluation with (more documentation at the [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness/tree/big-refactor) repo):
+```
+python evals/lm_harness_eval.py --model mamba --model_args pretrained=state-spaces/mamba-130m --tasks lambada_openai,hellaswag,piqa,arc_easy,arc_challenge,winogrande --device cuda --batch_size 64
+python evals/lm_harness_eval.py --model hf --model_args pretrained=EleutherAI/pythia-160m --tasks lambada_openai,hellaswag,piqa,arc_easy,arc_challenge,winogrande --device cuda --batch_size 64
+```
+
+Note that the result of each task might differ from reported values by 0.1-0.3 due to noise in the evaluation process.
+
+## Inference
+
+The script [benchmarks/benchmark_generation_mamba_simple.py](benchmarks/benchmark_generation_mamba_simple.py)
+1. autoloads a model from the HuggingFace Hub,
+2. generates completions of a user-specified prompt,
+3. benchmarks the inference speed of this generation.
+
+Other configurable options include the top-p (nucleus sampling) probability, and the softmax temperature.
+
+### Examples
+
+To test generation latency (e.g.
batch size = 1) with different sampling strategies: + +``` +python benchmarks/benchmark_generation_mamba_simple.py --model-name "state-spaces/mamba-2.8b" --prompt "My cat wrote all this CUDA code for a new language model and" --topp 0.9 --temperature 0.5 +python benchmarks/benchmark_generation_mamba_simple.py --model-name "EleutherAI/pythia-2.8b" --prompt "My cat wrote all this CUDA code for a new language model and" --topp 0.9 --temperature 0.5 +``` + +To test generation throughput with random prompts (e.g. large batch size): +``` +python benchmarks/benchmark_generation_mamba_simple.py --model-name "state-spaces/mamba-2.8b" --batch 128 +python benchmarks/benchmark_generation_mamba_simple.py --model-name "EleutherAI/pythia-2.8b" --batch 128 +``` + +## Citation + +If you use this codebase, or otherwise found our work valuable, please cite Mamba: +``` +@article{mamba, + title={Mamba: Linear-Time Sequence Modeling with Selective State Spaces}, + author={Gu, Albert and Dao, Tri}, + journal={arXiv preprint arXiv:2312.00752}, + year={2023} +} +``` diff --git a/mamba/assets/selection.png b/mamba/assets/selection.png new file mode 100644 index 0000000000000000000000000000000000000000..69b109a8eed4e3c7516b23e2b39d37e842a4464b Binary files /dev/null and b/mamba/assets/selection.png differ diff --git a/mamba/benchmarks/benchmark_generation_mamba_simple.py b/mamba/benchmarks/benchmark_generation_mamba_simple.py new file mode 100644 index 0000000000000000000000000000000000000000..8f2943cb4bde6f25eddb82b7b999c5c5f8b39acc --- /dev/null +++ b/mamba/benchmarks/benchmark_generation_mamba_simple.py @@ -0,0 +1,88 @@ +# Copyright (c) 2023, Tri Dao, Albert Gu. + +import argparse +import time +import json + +import torch +import torch.nn.functional as F + +from einops import rearrange + +from transformers import AutoTokenizer, AutoModelForCausalLM + +from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel + + +parser = argparse.ArgumentParser(description="Generation benchmarking") +parser.add_argument("--model-name", type=str, default="state-spaces/mamba-130m") +parser.add_argument("--prompt", type=str, default=None) +parser.add_argument("--promptlen", type=int, default=100) +parser.add_argument("--genlen", type=int, default=100) +parser.add_argument("--temperature", type=float, default=1.0) +parser.add_argument("--topk", type=int, default=1) +parser.add_argument("--topp", type=float, default=1.0) +parser.add_argument("--batch", type=int, default=1) +args = parser.parse_args() + +repeats = 3 +device = "cuda" +dtype = torch.float16 + +print(f"Loading model {args.model_name}") +is_mamba = args.model_name.startswith("state-spaces/mamba-") or "mamba" in args.model_name + +if is_mamba: + tokenizer = AutoTokenizer.from_pretrained("/home/zhulianghui/VisionProjects/mamba/ckpts/gpt-neox-20b-tokenizer") + model = MambaLMHeadModel.from_pretrained(args.model_name, device=device, dtype=dtype) +else: + tokenizer = AutoTokenizer.from_pretrained(args.model_name) + model = AutoModelForCausalLM.from_pretrained(args.model_name, device_map={"": device}, torch_dtype=dtype) +model.eval() +print(f"Number of parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)}") + +torch.random.manual_seed(0) +if args.prompt is None: + input_ids = torch.randint(1, 1000, (args.batch, args.promptlen), dtype=torch.long, device="cuda") + attn_mask = torch.ones_like(input_ids, dtype=torch.long, device="cuda") +else: + tokens = tokenizer(args.prompt, return_tensors="pt") + input_ids = tokens.input_ids.to(device=device) + 
attn_mask = tokens.attention_mask.to(device=device) +max_length = input_ids.shape[1] + args.genlen + +if is_mamba: + fn = lambda: model.generate( + input_ids=input_ids, + max_length=max_length, + cg=True, + return_dict_in_generate=True, + output_scores=True, + enable_timing=False, + temperature=args.temperature, + top_k=args.topk, + top_p=args.topp, + ) +else: + fn = lambda: model.generate( + input_ids=input_ids, + attention_mask=attn_mask, + max_length=max_length, + return_dict_in_generate=True, + pad_token_id=tokenizer.eos_token_id, + do_sample=True, + temperature=args.temperature, + top_k=args.topk, + top_p=args.topp, + ) +out = fn() +if args.prompt is not None: + print(tokenizer.batch_decode(out.sequences.tolist())) + +torch.cuda.synchronize() +start = time.time() +for _ in range(repeats): + fn() +torch.cuda.synchronize() +print(f"Prompt length: {len(input_ids[0])}, generation length: {len(out.sequences[0]) - len(input_ids[0])}") +print(f"{args.model_name} prompt processing + decoding time: {(time.time() - start) / repeats * 1000:.0f}ms") diff --git a/mamba/csrc/selective_scan/reverse_scan.cuh b/mamba/csrc/selective_scan/reverse_scan.cuh new file mode 100644 index 0000000000000000000000000000000000000000..d7e93174bb391d45271e6c77669a5e52d6c9cc78 --- /dev/null +++ b/mamba/csrc/selective_scan/reverse_scan.cuh @@ -0,0 +1,401 @@ +/****************************************************************************** + * Copyright (c) 2023, Tri Dao. + ******************************************************************************/ + +#pragma once + +#include + +#include +#include +#include +// #include +#include "uninitialized_copy.cuh" + +/** + * Perform a reverse sequential reduction over \p LENGTH elements of the \p input array. The aggregate is returned. + */ +template < + int LENGTH, + typename T, + typename ReductionOp> +__device__ __forceinline__ T ThreadReverseReduce(const T (&input)[LENGTH], ReductionOp reduction_op) { + static_assert(LENGTH > 0); + T retval = input[LENGTH - 1]; + #pragma unroll + for (int i = LENGTH - 2; i >= 0; --i) { retval = reduction_op(retval, input[i]); } + return retval; +} + +/** + * Perform a sequential inclusive postfix reverse scan over the statically-sized \p input array, seeded with the specified \p postfix. The aggregate is returned. + */ +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadReverseScanInclusive( + const T (&input)[LENGTH], + T (&output)[LENGTH], + ScanOp scan_op, + const T postfix) +{ + T inclusive = postfix; + #pragma unroll + for (int i = LENGTH - 1; i >= 0; --i) { + inclusive = scan_op(inclusive, input[i]); + output[i] = inclusive; + } +} + +/** + * Perform a sequential exclusive postfix reverse scan over the statically-sized \p input array, seeded with the specified \p postfix. The aggregate is returned. + */ +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadReverseScanExclusive( + const T (&input)[LENGTH], + T (&output)[LENGTH], + ScanOp scan_op, + const T postfix) +{ + // Careful, output maybe be aliased to input + T exclusive = postfix; + T inclusive; + #pragma unroll + for (int i = LENGTH - 1; i >= 0; --i) { + inclusive = scan_op(exclusive, input[i]); + output[i] = exclusive; + exclusive = inclusive; + } + return inclusive; +} + + +/** + * \brief WarpReverseScan provides SHFL-based variants of parallel postfix scan of items partitioned across a CUDA thread warp. 
+ * + * LOGICAL_WARP_THREADS must be a power-of-two + */ +template < + typename T, ///< Data type being scanned + int LOGICAL_WARP_THREADS ///< Number of threads per logical warp + > +struct WarpReverseScan { + //--------------------------------------------------------------------- + // Constants and type definitions + //--------------------------------------------------------------------- + + /// Whether the logical warp size and the PTX warp size coincide + static constexpr bool IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0)); + /// The number of warp scan steps + static constexpr int STEPS = cub::Log2::VALUE; + static_assert(LOGICAL_WARP_THREADS == 1 << STEPS); + + + //--------------------------------------------------------------------- + // Thread fields + //--------------------------------------------------------------------- + + /// Lane index in logical warp + unsigned int lane_id; + + /// Logical warp index in 32-thread physical warp + unsigned int warp_id; + + /// 32-thread physical warp member mask of logical warp + unsigned int member_mask; + + //--------------------------------------------------------------------- + // Construction + //--------------------------------------------------------------------- + + /// Constructor + explicit __device__ __forceinline__ + WarpReverseScan() + : lane_id(cub::LaneId()) + , warp_id(IS_ARCH_WARP ? 0 : (lane_id / LOGICAL_WARP_THREADS)) + , member_mask(cub::WarpMask(warp_id)) + { + if (!IS_ARCH_WARP) { + lane_id = lane_id % LOGICAL_WARP_THREADS; + } + } + + + /// Broadcast + __device__ __forceinline__ T Broadcast( + T input, ///< [in] The value to broadcast + int src_lane) ///< [in] Which warp lane is to do the broadcasting + { + return cub::ShuffleIndex(input, src_lane, member_mask); + } + + + /// Inclusive scan + template + __device__ __forceinline__ void InclusiveReverseScan( + T input, ///< [in] Calling thread's input item. + T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOpT scan_op) ///< [in] Binary scan operator + { + inclusive_output = input; + #pragma unroll + for (int STEP = 0; STEP < STEPS; STEP++) { + int offset = 1 << STEP; + T temp = cub::ShuffleDown( + inclusive_output, offset, LOGICAL_WARP_THREADS - 1, member_mask + ); + // Perform scan op if from a valid peer + inclusive_output = static_cast(lane_id) >= LOGICAL_WARP_THREADS - offset + ? inclusive_output : scan_op(temp, inclusive_output); + } + } + + /// Exclusive scan + // Get exclusive from inclusive + template + __device__ __forceinline__ void ExclusiveReverseScan( + T input, ///< [in] Calling thread's input item. + T &exclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOpT scan_op, ///< [in] Binary scan operator + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. + { + T inclusive_output; + InclusiveReverseScan(input, inclusive_output, scan_op); + warp_aggregate = cub::ShuffleIndex(inclusive_output, 0, member_mask); + // initial value unknown + exclusive_output = cub::ShuffleDown( + inclusive_output, 1, LOGICAL_WARP_THREADS - 1, member_mask + ); + } + + /** + * \brief Computes both inclusive and exclusive reverse scans using the specified binary scan functor across the calling warp. Because no initial value is supplied, the \p exclusive_output computed for the last warp-lane is undefined. + */ + template + __device__ __forceinline__ void ReverseScan( + T input, ///< [in] Calling thread's input item. 
+ T &inclusive_output, ///< [out] Calling thread's inclusive-scan output item. + T &exclusive_output, ///< [out] Calling thread's exclusive-scan output item. + ScanOpT scan_op) ///< [in] Binary scan operator + { + InclusiveReverseScan(input, inclusive_output, scan_op); + // initial value unknown + exclusive_output = cub::ShuffleDown( + inclusive_output, 1, LOGICAL_WARP_THREADS - 1, member_mask + ); + } + +}; + +/** + * \brief BlockReverseScan provides variants of raking-based parallel postfix scan across a CUDA thread block. + */ +template < + typename T, ///< Data type being scanned + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + bool MEMOIZE=false ///< Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure + > +struct BlockReverseScan { + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + /// Constants + /// The thread block size in threads + static constexpr int BLOCK_THREADS = BLOCK_DIM_X; + + /// Layout type for padded thread block raking grid + using BlockRakingLayout = cub::BlockRakingLayout; + // The number of reduction elements is not a multiple of the number of raking threads for now + static_assert(BlockRakingLayout::UNGUARDED); + + /// Number of raking threads + static constexpr int RAKING_THREADS = BlockRakingLayout::RAKING_THREADS; + /// Number of raking elements per warp synchronous raking thread + static constexpr int SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH; + /// Cooperative work can be entirely warp synchronous + static constexpr bool WARP_SYNCHRONOUS = (int(BLOCK_THREADS) == int(RAKING_THREADS)); + + /// WarpReverseScan utility type + using WarpReverseScan = WarpReverseScan; + + /// Shared memory storage layout type + struct _TempStorage { + typename BlockRakingLayout::TempStorage raking_grid; ///< Padded thread block raking grid + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : cub::Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + // Thread fields + _TempStorage &temp_storage; + unsigned int linear_tid; + T cached_segment[SEGMENT_LENGTH]; + + + //--------------------------------------------------------------------- + // Utility methods + //--------------------------------------------------------------------- + + /// Performs upsweep raking reduction, returning the aggregate + template + __device__ __forceinline__ T Upsweep(ScanOp scan_op) { + T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); + // Read data into registers + #pragma unroll + for (int i = 0; i < SEGMENT_LENGTH; ++i) { cached_segment[i] = smem_raking_ptr[i]; } + T raking_partial = cached_segment[SEGMENT_LENGTH - 1]; + #pragma unroll + for (int i = SEGMENT_LENGTH - 2; i >= 0; --i) { + raking_partial = scan_op(raking_partial, cached_segment[i]); + } + return raking_partial; + } + + + /// Performs exclusive downsweep raking scan + template + __device__ __forceinline__ void ExclusiveDownsweep( + ScanOp scan_op, + T raking_partial) + { + T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); + // Read data back into registers + if (!MEMOIZE) { + #pragma unroll + for (int i = 0; i < SEGMENT_LENGTH; ++i) { 
cached_segment[i] = smem_raking_ptr[i]; } + } + ThreadReverseScanExclusive(cached_segment, cached_segment, scan_op, raking_partial); + // Write data back to smem + #pragma unroll + for (int i = 0; i < SEGMENT_LENGTH; ++i) { smem_raking_ptr[i] = cached_segment[i]; } + } + + + //--------------------------------------------------------------------- + // Constructors + //--------------------------------------------------------------------- + + /// Constructor + __device__ __forceinline__ BlockReverseScan( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + linear_tid(cub::RowMajorTid(BLOCK_DIM_X, 1, 1)) + {} + + + /// Computes an exclusive thread block-wide postfix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_postfix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically postfixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template < + typename ScanOp, + typename BlockPostfixCallbackOp> + __device__ __forceinline__ void ExclusiveReverseScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + BlockPostfixCallbackOp &block_postfix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide postfix to be applied to all inputs. + { + if (WARP_SYNCHRONOUS) { + // Short-circuit directly to warp-synchronous scan + T block_aggregate; + WarpReverseScan warp_scan; + warp_scan.ExclusiveReverseScan(input, exclusive_output, scan_op, block_aggregate); + // Obtain warp-wide postfix in lane0, then broadcast to other lanes + T block_postfix = block_postfix_callback_op(block_aggregate); + block_postfix = warp_scan.Broadcast(block_postfix, 0); + exclusive_output = linear_tid == BLOCK_THREADS - 1 ? block_postfix : scan_op(block_postfix, exclusive_output); + } else { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + detail::uninitialized_copy(placement_ptr, input); + cub::CTA_SYNC(); + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) { + WarpReverseScan warp_scan; + // Raking upsweep reduction across shared partials + T upsweep_partial = Upsweep(scan_op); + // Warp-synchronous scan + T exclusive_partial, block_aggregate; + warp_scan.ExclusiveReverseScan(upsweep_partial, exclusive_partial, scan_op, block_aggregate); + // Obtain block-wide postfix in lane0, then broadcast to other lanes + T block_postfix = block_postfix_callback_op(block_aggregate); + block_postfix = warp_scan.Broadcast(block_postfix, 0); + // Update postfix with warpscan exclusive partial + T downsweep_postfix = linear_tid == RAKING_THREADS - 1 + ? block_postfix : scan_op(block_postfix, exclusive_partial); + // Exclusive raking downsweep scan + ExclusiveDownsweep(scan_op, downsweep_postfix); + } + cub::CTA_SYNC(); + // Grab thread postfix from shared memory + exclusive_output = *placement_ptr; + + // // Compute warp scan in each warp. + // // The exclusive output from the last lane in each warp is invalid. 
+ // T inclusive_output; + // WarpReverseScan warp_scan; + // warp_scan.ReverseScan(input, inclusive_output, exclusive_output, scan_op); + + // // Compute the warp-wide postfix and block-wide aggregate for each warp. Warp postfix for the last warp is invalid. + // T block_aggregate; + // T warp_postfix = ComputeWarpPostfix(scan_op, inclusive_output, block_aggregate); + + // // Apply warp postfix to our lane's partial + // if (warp_id != 0) { + // exclusive_output = scan_op(warp_postfix, exclusive_output); + // if (lane_id == 0) { exclusive_output = warp_postfix; } + // } + + // // Use the first warp to determine the thread block postfix, returning the result in lane0 + // if (warp_id == 0) { + // T block_postfix = block_postfix_callback_op(block_aggregate); + // if (lane_id == 0) { + // // Share the postfix with all threads + // detail::uninitialized_copy(&temp_storage.block_postfix, + // block_postfix); + + // exclusive_output = block_postfix; // The block postfix is the exclusive output for tid0 + // } + // } + + // cub::CTA_SYNC(); + + // // Incorporate thread block postfix into outputs + // T block_postfix = temp_storage.block_postfix; + // if (linear_tid > 0) { exclusive_output = scan_op(block_postfix, exclusive_output); } + } + } + + + /** + * \brief Computes an inclusive block-wide postfix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. the call-back functor \p block_postfix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically postfixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp, + typename BlockPostfixCallbackOp> + __device__ __forceinline__ void InclusiveReverseScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan functor + BlockPostfixCallbackOp &block_postfix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide postfix to be applied to the logical input sequence. + { + // Reduce consecutive thread items in registers + T thread_postfix = ThreadReverseReduce(input, scan_op); + // Exclusive thread block-scan + ExclusiveReverseScan(thread_postfix, thread_postfix, scan_op, block_postfix_callback_op); + // Inclusive scan in registers with postfix as seed + ThreadReverseScanInclusive(input, output, scan_op, thread_postfix); + } + +}; \ No newline at end of file diff --git a/mamba/csrc/selective_scan/selective_scan.cpp b/mamba/csrc/selective_scan/selective_scan.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f51af402a190dc14247ef8185a7d01b697313f02 --- /dev/null +++ b/mamba/csrc/selective_scan/selective_scan.cpp @@ -0,0 +1,497 @@ +/****************************************************************************** + * Copyright (c) 2023, Tri Dao. + ******************************************************************************/ + +#include +#include +#include +#include + +#include "selective_scan.h" + +#define CHECK_SHAPE(x, ...) TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), #x " must have shape (" #__VA_ARGS__ ")") + +#define DISPATCH_ITYPE_FLOAT_AND_HALF_AND_BF16(ITYPE, NAME, ...) 
\ + if (ITYPE == at::ScalarType::Half) { \ + using input_t = at::Half; \ + __VA_ARGS__(); \ + } else if (ITYPE == at::ScalarType::BFloat16) { \ + using input_t = at::BFloat16; \ + __VA_ARGS__(); \ + } else if (ITYPE == at::ScalarType::Float) { \ + using input_t = float; \ + __VA_ARGS__(); \ + } else { \ + AT_ERROR(#NAME, " not implemented for input type '", toString(ITYPE), "'"); \ + } + +#define DISPATCH_WTYPE_FLOAT_AND_HALF_AND_BF16(WTYPE, NAME, ...) \ + if (WTYPE == at::ScalarType::Half) { \ + using weight_t = at::Half; \ + __VA_ARGS__(); \ + } else if (WTYPE == at::ScalarType::BFloat16) { \ + using weight_t = at::BFloat16; \ + __VA_ARGS__(); \ + } else if (WTYPE == at::ScalarType::Float) { \ + using weight_t = float; \ + __VA_ARGS__(); \ + } else { \ + AT_ERROR(#NAME, " not implemented for weight type '", toString(WTYPE), "'"); \ + } + +#define DISPATCH_WTYPE_FLOAT_AND_COMPLEX(WTYPE, NAME, ...) \ + if (WTYPE == at::ScalarType::Float) { \ + using weight_t = float; \ + __VA_ARGS__(); \ + } else if (WTYPE == at::ScalarType::ComplexFloat) { \ + using weight_t = c10::complex; \ + __VA_ARGS__(); \ + } else { \ + AT_ERROR(#NAME, " not implemented for weight type '", toString(WTYPE), "'"); \ + } + +template +void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream); + +template +void selective_scan_bwd_cuda(SSMParamsBwd ¶ms, cudaStream_t stream); + +void set_ssm_params_fwd(SSMParamsBase ¶ms, + // sizes + const size_t batch, + const size_t dim, + const size_t seqlen, + const size_t dstate, + const size_t n_groups, + const size_t n_chunks, + const bool is_variable_B, + const bool is_variable_C, + // device pointers + const at::Tensor u, + const at::Tensor delta, + const at::Tensor A, + const at::Tensor B, + const at::Tensor C, + const at::Tensor out, + const at::Tensor z, + const at::Tensor out_z, + void* D_ptr, + void* delta_bias_ptr, + void* x_ptr, + bool has_z, + bool delta_softplus) { + + // Reset the parameters + memset(¶ms, 0, sizeof(params)); + + params.batch = batch; + params.dim = dim; + params.seqlen = seqlen; + params.dstate = dstate; + params.n_groups = n_groups; + params.n_chunks = n_chunks; + params.dim_ngroups_ratio = dim / n_groups; + + params.delta_softplus = delta_softplus; + + params.is_variable_B = is_variable_B; + params.is_variable_C = is_variable_C; + + // Set the pointers and strides. + params.u_ptr = u.data_ptr(); + params.delta_ptr = delta.data_ptr(); + params.A_ptr = A.data_ptr(); + params.B_ptr = B.data_ptr(); + params.C_ptr = C.data_ptr(); + params.D_ptr = D_ptr; + params.delta_bias_ptr = delta_bias_ptr; + params.out_ptr = out.data_ptr(); + params.x_ptr = x_ptr; + params.z_ptr = has_z ? z.data_ptr() : nullptr; + params.out_z_ptr = has_z ? out_z.data_ptr() : nullptr; + // All stride are in elements, not bytes. + params.A_d_stride = A.stride(0); + params.A_dstate_stride = A.stride(1); + if (!is_variable_B) { + params.B_d_stride = B.stride(0); + } else { + params.B_batch_stride = B.stride(0); + params.B_group_stride = B.stride(1); + } + params.B_dstate_stride = !is_variable_B ? B.stride(1) : B.stride(2); + if (!is_variable_C) { + params.C_d_stride = C.stride(0); + } else { + params.C_batch_stride = C.stride(0); + params.C_group_stride = C.stride(1); + } + params.C_dstate_stride = !is_variable_C ? 
C.stride(1) : C.stride(2); + params.u_batch_stride = u.stride(0); + params.u_d_stride = u.stride(1); + params.delta_batch_stride = delta.stride(0); + params.delta_d_stride = delta.stride(1); + if (has_z) { + params.z_batch_stride = z.stride(0); + params.z_d_stride = z.stride(1); + params.out_z_batch_stride = out_z.stride(0); + params.out_z_d_stride = out_z.stride(1); + } + params.out_batch_stride = out.stride(0); + params.out_d_stride = out.stride(1); +} + +void set_ssm_params_bwd(SSMParamsBwd ¶ms, + // sizes + const size_t batch, + const size_t dim, + const size_t seqlen, + const size_t dstate, + const size_t n_groups, + const size_t n_chunks, + const bool is_variable_B, + const bool is_variable_C, + // device pointers + const at::Tensor u, + const at::Tensor delta, + const at::Tensor A, + const at::Tensor B, + const at::Tensor C, + const at::Tensor z, + const at::Tensor out, + const at::Tensor out_z, + void* D_ptr, + void* delta_bias_ptr, + void* x_ptr, + const at::Tensor dout, + const at::Tensor du, + const at::Tensor ddelta, + const at::Tensor dA, + const at::Tensor dB, + const at::Tensor dC, + const at::Tensor dz, + void* dD_ptr, + void* ddelta_bias_ptr, + bool has_z, + bool delta_softplus, + bool recompute_out_z) { + // Pass in "dout" instead of "out", we're not gonna use "out" unless we have z + set_ssm_params_fwd(params, batch, dim, seqlen, dstate, n_groups, n_chunks, is_variable_B, is_variable_C, + u, delta, A, B, C, has_z ? out : dout, + has_z ? z : dout, + // If not recompute_out_z, pass dout instead of out_z. + // This won't be used by the bwd kernel + recompute_out_z ? out_z : dout, + D_ptr, delta_bias_ptr, x_ptr, has_z, delta_softplus); + if (!recompute_out_z) { params.out_z_ptr = nullptr; } + + // Set the pointers and strides. + params.dout_ptr = dout.data_ptr(); + params.du_ptr = du.data_ptr(); + params.dA_ptr = dA.data_ptr(); + params.dB_ptr = dB.data_ptr(); + params.dC_ptr = dC.data_ptr(); + params.dD_ptr = dD_ptr; + params.ddelta_ptr = ddelta.data_ptr(); + params.ddelta_bias_ptr = ddelta_bias_ptr; + params.dz_ptr = has_z ? dz.data_ptr() : nullptr; + // All stride are in elements, not bytes. + params.dout_batch_stride = dout.stride(0); + params.dout_d_stride = dout.stride(1); + params.dA_d_stride = dA.stride(0); + params.dA_dstate_stride = dA.stride(1); + if (!is_variable_B) { + params.dB_d_stride = dB.stride(0); + } else { + params.dB_batch_stride = dB.stride(0); + params.dB_group_stride = dB.stride(1); + } + params.dB_dstate_stride = !is_variable_B ? dB.stride(1) : dB.stride(2); + if (!is_variable_C) { + params.dC_d_stride = dC.stride(0); + } else { + params.dC_batch_stride = dC.stride(0); + params.dC_group_stride = dC.stride(1); + } + params.dC_dstate_stride = !is_variable_C ? 
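
Both parameter setters record `Tensor::stride()` values, which PyTorch reports in elements rather than bytes, so the kernels can index with plain pointer arithmetic on typed pointers. A small host-side sketch of that convention for a contiguous `(batch, dim, seqlen)` buffer (names and sizes are illustrative):

```cpp
#include <cstdio>
#include <vector>

// Element-count strides for a contiguous (batch, dim, seqlen) tensor, mirroring
// how u_batch_stride / u_d_stride are used:  &u[b][d][0] == base + b*batch_stride + d*d_stride
int main() {
    const int batch = 2, dim = 3, seqlen = 4;
    std::vector<float> u(batch * dim * seqlen);
    for (int i = 0; i < (int)u.size(); ++i) u[i] = (float)i;

    const long u_batch_stride = dim * seqlen;  // stride(0) in elements
    const long u_d_stride     = seqlen;        // stride(1) in elements

    const int b = 1, d = 2, t = 3;
    const float *row = u.data() + b * u_batch_stride + d * u_d_stride;
    std::printf("u[%d][%d][%d] = %g\n", b, d, t, row[t]);  // prints 23
}
```
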
dC.stride(1) : dC.stride(2); + params.du_batch_stride = du.stride(0); + params.du_d_stride = du.stride(1); + params.ddelta_batch_stride = ddelta.stride(0); + params.ddelta_d_stride = ddelta.stride(1); + if (has_z) { + params.dz_batch_stride = dz.stride(0); + params.dz_d_stride = dz.stride(1); + } +} + +std::vector +selective_scan_fwd(const at::Tensor &u, const at::Tensor &delta, + const at::Tensor &A, const at::Tensor &B, const at::Tensor &C, + const c10::optional &D_, + const c10::optional &z_, + const c10::optional &delta_bias_, + bool delta_softplus) { + auto input_type = u.scalar_type(); + auto weight_type = A.scalar_type(); + TORCH_CHECK(input_type == at::ScalarType::Float || input_type == at::ScalarType::Half || input_type == at::ScalarType::BFloat16); + TORCH_CHECK(weight_type == at::ScalarType::Float || weight_type == at::ScalarType::ComplexFloat); + + const bool is_variable_B = B.dim() >= 3; + const bool is_variable_C = C.dim() >= 3; + const bool is_complex = weight_type == at::ScalarType::ComplexFloat; + + TORCH_CHECK(delta.scalar_type() == input_type); + TORCH_CHECK(B.scalar_type() == (!is_variable_B ? weight_type : input_type)); + TORCH_CHECK(C.scalar_type() == (!is_variable_C ? weight_type : input_type)); + + TORCH_CHECK(u.is_cuda()); + TORCH_CHECK(delta.is_cuda()); + TORCH_CHECK(A.is_cuda()); + TORCH_CHECK(B.is_cuda()); + TORCH_CHECK(C.is_cuda()); + + TORCH_CHECK(u.stride(-1) == 1); + TORCH_CHECK(delta.stride(-1) == 1); + + const auto sizes = u.sizes(); + const int batch_size = sizes[0]; + const int dim = sizes[1]; + const int seqlen = sizes[2]; + const int dstate = A.size(1); + const int n_groups = is_variable_B ? B.size(1) : 1; + + TORCH_CHECK(dstate <= 256, "selective_scan only supports state dimension <= 256"); + + CHECK_SHAPE(u, batch_size, dim, seqlen); + CHECK_SHAPE(delta, batch_size, dim, seqlen); + CHECK_SHAPE(A, dim, dstate); + if (!is_variable_B) { + CHECK_SHAPE(B, dim, dstate); + } else { + CHECK_SHAPE(B, batch_size, n_groups, dstate, !is_complex ? seqlen : seqlen * 2); + TORCH_CHECK(B.stride(-1) == 1); + } + if (!is_variable_C) { + CHECK_SHAPE(C, dim, dstate); + } else { + CHECK_SHAPE(C, batch_size, n_groups, dstate, !is_complex ? 
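
For reference, the checks in `selective_scan_fwd`/`selective_scan_bwd` pin down one layout contract: `u` and `delta` are `(batch, dim, seqlen)`, `A` is `(dim, dstate)`, and `B`/`C` are either per-channel `(dim, dstate)` matrices or, when input-dependent (`dim() >= 3`), `(batch, n_groups, dstate, seqlen)` tensors with the last axis doubled for complex weights. A tiny sketch of that rule (the concrete sizes below are made up):

```cpp
#include <cstdio>

// Shape convention enforced by the host-side checks (illustrative only):
//   u, delta:                 (batch, dim, seqlen)
//   A:                        (dim, dstate)
//   B, C (fixed):             (dim, dstate)
//   B, C (input-dependent):   (batch, n_groups, dstate, seqlen)      real weights
//                             (batch, n_groups, dstate, 2 * seqlen)  complex weights
int main() {
    const int batch = 8, dim = 1536, seqlen = 4096, dstate = 16, n_groups = 1;
    const bool is_variable_B = true, is_complex = false;

    if (!is_variable_B) {
        std::printf("B: (%d, %d)\n", dim, dstate);
    } else {
        std::printf("B: (%d, %d, %d, %d)\n", batch, n_groups, dstate,
                    !is_complex ? seqlen : 2 * seqlen);
    }
}
```
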
seqlen: seqlen * 2); + TORCH_CHECK(C.stride(-1) == 1); + } + + if (D_.has_value()) { + auto D = D_.value(); + TORCH_CHECK(D.scalar_type() == at::ScalarType::Float); + TORCH_CHECK(D.is_cuda()); + TORCH_CHECK(D.stride(-1) == 1); + CHECK_SHAPE(D, dim); + } + + if (delta_bias_.has_value()) { + auto delta_bias = delta_bias_.value(); + TORCH_CHECK(delta_bias.scalar_type() == at::ScalarType::Float); + TORCH_CHECK(delta_bias.is_cuda()); + TORCH_CHECK(delta_bias.stride(-1) == 1); + CHECK_SHAPE(delta_bias, dim); + } + + at::Tensor z, out_z; + const bool has_z = z_.has_value(); + if (has_z) { + z = z_.value(); + TORCH_CHECK(z.scalar_type() == input_type); + TORCH_CHECK(z.is_cuda()); + TORCH_CHECK(z.stride(-1) == 1); + CHECK_SHAPE(z, batch_size, dim, seqlen); + out_z = torch::empty_like(z); + } + + const int n_chunks = (seqlen + 2048 - 1) / 2048; + // const int n_chunks = (seqlen + 1024 - 1) / 1024; + // at::Tensor out = torch::empty_like(u); + // Right now u has BHL layout and delta has HBL layout, and we want out to have HBL layout + at::Tensor out = torch::empty_like(delta); + at::Tensor x; + x = torch::empty({batch_size, dim, n_chunks, dstate * 2}, u.options().dtype(weight_type)); + + SSMParamsBase params; + set_ssm_params_fwd(params, batch_size, dim, seqlen, dstate, n_groups, n_chunks, is_variable_B, is_variable_C, + u, delta, A, B, C, out, z, out_z, + D_.has_value() ? D_.value().data_ptr() : nullptr, + delta_bias_.has_value() ? delta_bias_.value().data_ptr() : nullptr, + x.data_ptr(), + has_z, + delta_softplus); + + // Otherwise the kernel will be launched from cuda:0 device + // Cast to char to avoid compiler warning about narrowing + at::cuda::CUDAGuard device_guard{(char)u.get_device()}; + auto stream = at::cuda::getCurrentCUDAStream().stream(); + DISPATCH_ITYPE_FLOAT_AND_HALF_AND_BF16(u.scalar_type(), "selective_scan_fwd", [&] { + DISPATCH_WTYPE_FLOAT_AND_COMPLEX(A.scalar_type(), "selective_scan_fwd", [&] { + selective_scan_fwd_cuda(params, stream); + }); + }); + std::vector result = {out, x}; + if (has_z) { result.push_back(out_z); } + return result; +} + +std::vector +selective_scan_bwd(const at::Tensor &u, const at::Tensor &delta, + const at::Tensor &A, const at::Tensor &B, const at::Tensor &C, + const c10::optional &D_, + const c10::optional &z_, + const c10::optional &delta_bias_, + const at::Tensor &dout, + const c10::optional &x_, + const c10::optional &out_, + c10::optional &dz_, + bool delta_softplus, + bool recompute_out_z) { + auto input_type = u.scalar_type(); + auto weight_type = A.scalar_type(); + TORCH_CHECK(input_type == at::ScalarType::Float || input_type == at::ScalarType::Half || input_type == at::ScalarType::BFloat16); + TORCH_CHECK(weight_type == at::ScalarType::Float || weight_type == at::ScalarType::ComplexFloat); + + const bool is_variable_B = B.dim() >= 3; + const bool is_variable_C = C.dim() >= 3; + const bool is_complex = weight_type == at::ScalarType::ComplexFloat; + + TORCH_CHECK(delta.scalar_type() == input_type); + TORCH_CHECK(B.scalar_type() == (!is_variable_B ? weight_type : input_type)); + TORCH_CHECK(C.scalar_type() == (!is_variable_C ? 
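
In the forward entry point above, the sequence is processed in chunks of up to 2048 timesteps, and one running scan state per `(chunk, state)` pair is checkpointed in `x`; the backward pass later reads these to re-seed each chunk. A sketch of the ceil-division and the resulting checkpoint shape (the sizes below are invented for illustration):

```cpp
#include <cstdio>

// Ceil-division used for n_chunks, and the shape of the per-chunk checkpoint
// x = (batch, dim, n_chunks, 2 * dstate); the factor 2 holds the running scan
// pair per state (packed as complex values when the weights are complex).
int main() {
    const int seqlen = 5000, batch = 4, dim = 768, dstate = 16;
    const int n_chunks = (seqlen + 2048 - 1) / 2048;   // == 3
    std::printf("n_chunks = %d\n", n_chunks);
    std::printf("x shape  = (%d, %d, %d, %d)\n", batch, dim, n_chunks, 2 * dstate);
}
```
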
weight_type : input_type)); + TORCH_CHECK(dout.scalar_type() == input_type); + + TORCH_CHECK(u.is_cuda()); + TORCH_CHECK(delta.is_cuda()); + TORCH_CHECK(A.is_cuda()); + TORCH_CHECK(B.is_cuda()); + TORCH_CHECK(C.is_cuda()); + TORCH_CHECK(dout.is_cuda()); + + TORCH_CHECK(u.stride(-1) == 1); + TORCH_CHECK(delta.stride(-1) == 1); + TORCH_CHECK(dout.stride(-1) == 1); + + const auto sizes = u.sizes(); + const int batch_size = sizes[0]; + const int dim = sizes[1]; + const int seqlen = sizes[2]; + const int dstate = A.size(1); + const int n_groups = is_variable_B ? B.size(1) : 1; + + TORCH_CHECK(dstate <= 256, "selective_scan only supports state dimension <= 256"); + + CHECK_SHAPE(u, batch_size, dim, seqlen); + CHECK_SHAPE(delta, batch_size, dim, seqlen); + CHECK_SHAPE(A, dim, dstate); + if (!is_variable_B) { + CHECK_SHAPE(B, dim, dstate); + } else { + CHECK_SHAPE(B, batch_size, n_groups, dstate, !is_complex ? seqlen : seqlen * 2); + TORCH_CHECK(B.stride(-1) == 1); + } + if (!is_variable_C) { + CHECK_SHAPE(C, dim, dstate); + } else { + CHECK_SHAPE(C, batch_size, n_groups, dstate, !is_complex ? seqlen: seqlen * 2); + TORCH_CHECK(C.stride(-1) == 1); + } + CHECK_SHAPE(dout, batch_size, dim, seqlen); + + if (D_.has_value()) { + auto D = D_.value(); + TORCH_CHECK(D.scalar_type() == at::ScalarType::Float); + TORCH_CHECK(D.is_cuda()); + TORCH_CHECK(D.stride(-1) == 1); + CHECK_SHAPE(D, dim); + } + + if (delta_bias_.has_value()) { + auto delta_bias = delta_bias_.value(); + TORCH_CHECK(delta_bias.scalar_type() == at::ScalarType::Float); + TORCH_CHECK(delta_bias.is_cuda()); + TORCH_CHECK(delta_bias.stride(-1) == 1); + CHECK_SHAPE(delta_bias, dim); + } + + at::Tensor z, out, dz, out_z; + const bool has_z = z_.has_value(); + if (has_z) { + z = z_.value(); + TORCH_CHECK(z.scalar_type() == input_type); + TORCH_CHECK(z.is_cuda()); + TORCH_CHECK(z.stride(-1) == 1); + CHECK_SHAPE(z, batch_size, dim, seqlen); + + TORCH_CHECK(out_.has_value()); + out = out_.value(); + TORCH_CHECK(out.scalar_type() == input_type); + TORCH_CHECK(out.is_cuda()); + TORCH_CHECK(out.stride(-1) == 1); + CHECK_SHAPE(out, batch_size, dim, seqlen); + + if (dz_.has_value()) { + dz = dz_.value(); + TORCH_CHECK(dz.scalar_type() == input_type); + TORCH_CHECK(dz.is_cuda()); + TORCH_CHECK(dz.stride(-1) == 1); + CHECK_SHAPE(dz, batch_size, dim, seqlen); + } else { + dz = torch::empty_like(z); + } + if (recompute_out_z) { + out_z = torch::empty_like(out); + } + } + + const int n_chunks = (seqlen + 2048 - 1) / 2048; + // const int n_chunks = (seqlen + 1024 - 1) / 1024; + if (n_chunks > 1) { TORCH_CHECK(x_.has_value()); } + if (x_.has_value()) { + auto x = x_.value(); + TORCH_CHECK(x.scalar_type() == weight_type); + TORCH_CHECK(x.is_cuda()); + TORCH_CHECK(x.is_contiguous()); + CHECK_SHAPE(x, batch_size, dim, n_chunks, 2 * dstate); + } + + at::Tensor du = torch::empty_like(u); + at::Tensor ddelta = torch::empty_like(delta); + at::Tensor dA = torch::zeros_like(A); + at::Tensor dB = !is_variable_B ? torch::zeros_like(B) : torch::zeros_like(B, B.options().dtype(torch::kFloat32)); + at::Tensor dC = !is_variable_C ? 
torch::zeros_like(C) : torch::zeros_like(C, C.options().dtype(torch::kFloat32)); + at::Tensor dD; + if (D_.has_value()) { dD = torch::zeros_like(D_.value()); } + at::Tensor ddelta_bias; + if (delta_bias_.has_value()) { ddelta_bias = torch::zeros_like(delta_bias_.value()); } + + SSMParamsBwd params; + set_ssm_params_bwd(params, batch_size, dim, seqlen, dstate, n_groups, n_chunks, is_variable_B, is_variable_C, + u, delta, A, B, C, z, out, out_z, + D_.has_value() ? D_.value().data_ptr() : nullptr, + delta_bias_.has_value() ? delta_bias_.value().data_ptr() : nullptr, + x_.has_value() ? x_.value().data_ptr() : nullptr, + dout, du, ddelta, dA, dB, dC, dz, + D_.has_value() ? dD.data_ptr() : nullptr, + delta_bias_.has_value() ? ddelta_bias.data_ptr() : nullptr, + has_z, delta_softplus, recompute_out_z); + + // Otherwise the kernel will be launched from cuda:0 device + // Cast to char to avoid compiler warning about narrowing + at::cuda::CUDAGuard device_guard{(char)u.get_device()}; + auto stream = at::cuda::getCurrentCUDAStream().stream(); + DISPATCH_ITYPE_FLOAT_AND_HALF_AND_BF16(u.scalar_type(), "selective_scan_bwd", [&] { + DISPATCH_WTYPE_FLOAT_AND_COMPLEX(A.scalar_type(), "selective_scan_bwd", [&] { + selective_scan_bwd_cuda(params, stream); + }); + }); + std::vector result = {du, ddelta, dA, dB.to(B.dtype()), dC.to(C.dtype()), dD, ddelta_bias}; + if (has_z) { result.push_back(dz); } + if (recompute_out_z) { result.push_back(out_z); } + return result; +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("fwd", &selective_scan_fwd, "Selective scan forward"); + m.def("bwd", &selective_scan_bwd, "Selective scan backward"); +} diff --git a/mamba/csrc/selective_scan/selective_scan.h b/mamba/csrc/selective_scan/selective_scan.h new file mode 100644 index 0000000000000000000000000000000000000000..e2c7bcdbd5ddadc5975caa641ecb1dcd3b73dafd --- /dev/null +++ b/mamba/csrc/selective_scan/selective_scan.h @@ -0,0 +1,101 @@ +/****************************************************************************** + * Copyright (c) 2023, Tri Dao. + ******************************************************************************/ + +#pragma once + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct SSMScanParamsBase { + using index_t = uint32_t; + + int batch, seqlen, n_chunks; + index_t a_batch_stride; + index_t b_batch_stride; + index_t out_batch_stride; + + // Common data pointers. + void *__restrict__ a_ptr; + void *__restrict__ b_ptr; + void *__restrict__ out_ptr; + void *__restrict__ x_ptr; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct SSMParamsBase { + using index_t = uint32_t; + + int batch, dim, seqlen, dstate, n_groups, n_chunks; + int dim_ngroups_ratio; + bool is_variable_B; + bool is_variable_C; + + bool delta_softplus; + + index_t A_d_stride; + index_t A_dstate_stride; + index_t B_batch_stride; + index_t B_d_stride; + index_t B_dstate_stride; + index_t B_group_stride; + index_t C_batch_stride; + index_t C_d_stride; + index_t C_dstate_stride; + index_t C_group_stride; + index_t u_batch_stride; + index_t u_d_stride; + index_t delta_batch_stride; + index_t delta_d_stride; + index_t z_batch_stride; + index_t z_d_stride; + index_t out_batch_stride; + index_t out_d_stride; + index_t out_z_batch_stride; + index_t out_z_d_stride; + + // Common data pointers. 
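
When `B` or `C` is input-dependent, their gradients are built up from many `gpuAtomicAdd` contributions across thread blocks, so `dB`/`dC` are allocated in float32 and only cast back via `.to(B.dtype())` / `.to(C.dtype())` in the returned list. The sketch below is an analogy one precision level up (float vs. double standing in for fp16/bf16 vs. fp32) of why a low-precision accumulator can silently drop small contributions; it is not part of the extension itself:

```cpp
#include <cstdio>

// Accumulating one million contributions of 1e-8 into a running sum.
// In float, each add is below the rounding step of the accumulator once it
// reaches 1.0, so the sum stalls; in double it does not.
int main() {
    float  acc_f = 1.0f;
    double acc_d = 1.0;
    for (int i = 0; i < 1000000; ++i) {
        acc_f += 1e-8f;
        acc_d += 1e-8;
    }
    std::printf("float accumulator : %.8f\n", acc_f);  // stays at 1.00000000
    std::printf("double accumulator: %.8f\n", acc_d);  // ~1.01000000
}
```
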
+ void *__restrict__ A_ptr; + void *__restrict__ B_ptr; + void *__restrict__ C_ptr; + void *__restrict__ D_ptr; + void *__restrict__ u_ptr; + void *__restrict__ delta_ptr; + void *__restrict__ delta_bias_ptr; + void *__restrict__ out_ptr; + void *__restrict__ x_ptr; + void *__restrict__ z_ptr; + void *__restrict__ out_z_ptr; +}; + +struct SSMParamsBwd: public SSMParamsBase { + index_t dout_batch_stride; + index_t dout_d_stride; + index_t dA_d_stride; + index_t dA_dstate_stride; + index_t dB_batch_stride; + index_t dB_group_stride; + index_t dB_d_stride; + index_t dB_dstate_stride; + index_t dC_batch_stride; + index_t dC_group_stride; + index_t dC_d_stride; + index_t dC_dstate_stride; + index_t du_batch_stride; + index_t du_d_stride; + index_t dz_batch_stride; + index_t dz_d_stride; + index_t ddelta_batch_stride; + index_t ddelta_d_stride; + + // Common data pointers. + void *__restrict__ dout_ptr; + void *__restrict__ dA_ptr; + void *__restrict__ dB_ptr; + void *__restrict__ dC_ptr; + void *__restrict__ dD_ptr; + void *__restrict__ du_ptr; + void *__restrict__ dz_ptr; + void *__restrict__ ddelta_ptr; + void *__restrict__ ddelta_bias_ptr; +}; diff --git a/mamba/csrc/selective_scan/selective_scan_bwd_bf16_complex.cu b/mamba/csrc/selective_scan/selective_scan_bwd_bf16_complex.cu new file mode 100644 index 0000000000000000000000000000000000000000..c55f0e858af4ebd246a5d251308ab920b4e01a50 --- /dev/null +++ b/mamba/csrc/selective_scan/selective_scan_bwd_bf16_complex.cu @@ -0,0 +1,9 @@ +/****************************************************************************** + * Copyright (c) 2023, Tri Dao. + ******************************************************************************/ + +// Split into multiple files to compile in paralell + +#include "selective_scan_bwd_kernel.cuh" + +template void selective_scan_bwd_cuda(SSMParamsBwd ¶ms, cudaStream_t stream); \ No newline at end of file diff --git a/mamba/csrc/selective_scan/selective_scan_bwd_bf16_real.cu b/mamba/csrc/selective_scan/selective_scan_bwd_bf16_real.cu new file mode 100644 index 0000000000000000000000000000000000000000..72adaf5cb13c6429e2f345a0a823c6bc3722b95a --- /dev/null +++ b/mamba/csrc/selective_scan/selective_scan_bwd_bf16_real.cu @@ -0,0 +1,9 @@ +/****************************************************************************** + * Copyright (c) 2023, Tri Dao. + ******************************************************************************/ + +// Split into multiple files to compile in paralell + +#include "selective_scan_bwd_kernel.cuh" + +template void selective_scan_bwd_cuda(SSMParamsBwd ¶ms, cudaStream_t stream); \ No newline at end of file diff --git a/mamba/csrc/selective_scan/selective_scan_bwd_fp16_complex.cu b/mamba/csrc/selective_scan/selective_scan_bwd_fp16_complex.cu new file mode 100644 index 0000000000000000000000000000000000000000..df126d7c8d5f9f0862273d2fe21ea15b35757b64 --- /dev/null +++ b/mamba/csrc/selective_scan/selective_scan_bwd_fp16_complex.cu @@ -0,0 +1,9 @@ +/****************************************************************************** + * Copyright (c) 2023, Tri Dao. 
+ ******************************************************************************/ + +// Split into multiple files to compile in parallel + +#include "selective_scan_bwd_kernel.cuh" + +template void selective_scan_bwd_cuda<at::Half, complex_t>(SSMParamsBwd &params, cudaStream_t stream); \ No newline at end of file diff --git a/mamba/csrc/selective_scan/selective_scan_bwd_fp16_real.cu b/mamba/csrc/selective_scan/selective_scan_bwd_fp16_real.cu new file mode 100644 index 0000000000000000000000000000000000000000..3ff271b50eaff208ae33c16c87ab7aaee76dfd76 --- /dev/null +++ b/mamba/csrc/selective_scan/selective_scan_bwd_fp16_real.cu @@ -0,0 +1,9 @@ +/****************************************************************************** + * Copyright (c) 2023, Tri Dao. + ******************************************************************************/ + +// Split into multiple files to compile in parallel + +#include "selective_scan_bwd_kernel.cuh" + +template void selective_scan_bwd_cuda<at::Half, float>(SSMParamsBwd &params, cudaStream_t stream); \ No newline at end of file diff --git a/mamba/csrc/selective_scan/selective_scan_bwd_fp32_complex.cu b/mamba/csrc/selective_scan/selective_scan_bwd_fp32_complex.cu new file mode 100644 index 0000000000000000000000000000000000000000..5554902342785b289b81c060a71a51734fc1e6bf --- /dev/null +++ b/mamba/csrc/selective_scan/selective_scan_bwd_fp32_complex.cu @@ -0,0 +1,9 @@ +/****************************************************************************** + * Copyright (c) 2023, Tri Dao. + ******************************************************************************/ + +// Split into multiple files to compile in parallel + +#include "selective_scan_bwd_kernel.cuh" + +template void selective_scan_bwd_cuda<float, complex_t>(SSMParamsBwd &params, cudaStream_t stream); \ No newline at end of file diff --git a/mamba/csrc/selective_scan/selective_scan_bwd_fp32_real.cu b/mamba/csrc/selective_scan/selective_scan_bwd_fp32_real.cu new file mode 100644 index 0000000000000000000000000000000000000000..a7ed642231da80c455c0499702cc8a1cb4536ec2 --- /dev/null +++ b/mamba/csrc/selective_scan/selective_scan_bwd_fp32_real.cu @@ -0,0 +1,9 @@ +/****************************************************************************** + * Copyright (c) 2023, Tri Dao. + ******************************************************************************/ + +// Split into multiple files to compile in parallel + +#include "selective_scan_bwd_kernel.cuh" + +template void selective_scan_bwd_cuda<float, float>(SSMParamsBwd &params, cudaStream_t stream); \ No newline at end of file diff --git a/mamba/csrc/selective_scan/selective_scan_bwd_kernel.cuh b/mamba/csrc/selective_scan/selective_scan_bwd_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..2ed101148a4b32560111e5a832fc8b5881a4b243 --- /dev/null +++ b/mamba/csrc/selective_scan/selective_scan_bwd_kernel.cuh @@ -0,0 +1,531 @@ +/****************************************************************************** + * Copyright (c) 2023, Tri Dao.
+ ******************************************************************************/ + +#pragma once + +#include +#include +#include // For C10_CUDA_CHECK and C10_CUDA_KERNEL_LAUNCH_CHECK +#include // For atomicAdd on complex + +#include +#include +#include +#include + +#include "selective_scan.h" +#include "selective_scan_common.h" +#include "reverse_scan.cuh" +#include "static_switch.h" + +template __device__ __forceinline__ scalar_t conj(scalar_t x); +template<> __device__ __forceinline__ float conj(float x) { return x; } +template<> __device__ __forceinline__ complex_t conj(complex_t x) { return std::conj(x); } + +template +struct Selective_Scan_bwd_kernel_traits { + static_assert(kNItems_ % 4 == 0); + using input_t = input_t_; + using weight_t = weight_t_; + static constexpr int kNThreads = kNThreads_; + static constexpr int kNItems = kNItems_; + static constexpr int kNBytes = sizeof(input_t); + static_assert(kNBytes == 2 || kNBytes == 4); + static constexpr int kNElts = kNBytes == 4 ? 4 : std::min(8, kNItems); + static_assert(kNItems % kNElts == 0); + static constexpr int kNLoads = kNItems / kNElts; + static constexpr bool kIsComplex = std::is_same_v; + static constexpr bool kIsEvenLen = kIsEvenLen_; + static constexpr bool kIsVariableB = kIsVariableB_; + static constexpr bool kIsVariableC = kIsVariableC_; + static constexpr bool kDeltaSoftplus = kDeltaSoftplus_; + static constexpr bool kHasZ = kHasZ_; + // Setting MinBlocksPerMP to be 3 (instead of 2) for 128 threads with float improves occupancy. + // For complex this would lead to massive register spilling, so we keep it at 2. + static constexpr int kMinBlocks = kNThreads == 128 && !kIsComplex ? 3 : 2; + using vec_t = typename BytesToType::Type; + using scan_t = std::conditional_t; + using BlockLoadT = cub::BlockLoad; + using BlockLoadVecT = cub::BlockLoad; + using BlockLoadWeightT = cub::BlockLoad; + using BlockLoadWeightVecT = cub::BlockLoad; + using BlockStoreT = cub::BlockStore; + using BlockStoreVecT = cub::BlockStore; + // using BlockScanT = cub::BlockScan; + using BlockScanT = cub::BlockScan; + // using BlockScanT = cub::BlockScan; + using BlockReverseScanT = BlockReverseScan; + using BlockReduceT = cub::BlockReduce; + using BlockReduceFloatT = cub::BlockReduce; + using BlockReduceComplexT = cub::BlockReduce; + using BlockExchangeT = cub::BlockExchange; + static constexpr int kSmemIOSize = std::max({sizeof(typename BlockLoadT::TempStorage), + sizeof(typename BlockLoadVecT::TempStorage), + (int(kIsVariableB) + int(kIsVariableC)) * sizeof(typename BlockLoadWeightT::TempStorage), + (int(kIsVariableB) + int(kIsVariableC)) * sizeof(typename BlockLoadWeightVecT::TempStorage), + sizeof(typename BlockStoreT::TempStorage), + sizeof(typename BlockStoreVecT::TempStorage)}); + static constexpr int kSmemExchangeSize = (int(kIsVariableB) + int(kIsVariableC)) * sizeof(typename BlockExchangeT::TempStorage); + static constexpr int kSmemReduceSize = sizeof(typename BlockReduceT::TempStorage); + static constexpr int kSmemSize = kSmemIOSize + kSmemExchangeSize + kSmemReduceSize + sizeof(typename BlockScanT::TempStorage) + sizeof(typename BlockReverseScanT::TempStorage); +}; + +template +__global__ __launch_bounds__(Ktraits::kNThreads, Ktraits::kMinBlocks) +void selective_scan_bwd_kernel(SSMParamsBwd params) { + constexpr bool kIsComplex = Ktraits::kIsComplex; + constexpr bool kIsVariableB = Ktraits::kIsVariableB; + constexpr bool kIsVariableC = Ktraits::kIsVariableC; + constexpr bool kDeltaSoftplus = Ktraits::kDeltaSoftplus; + constexpr bool 
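
The kernel traits above pick a vector width so that each thread's `kNItems` elements can be moved with a few 16-byte transactions: 4-byte inputs travel as 4-element packets, 2-byte inputs as 8-element packets. A host-side mirror of that arithmetic (illustrative only, not the device code path):

```cpp
#include <algorithm>
#include <cstdio>

// Mirror of the vector-width arithmetic in the kernel traits, evaluated on the
// host: 4-byte inputs load 4 elements per vector, 2-byte inputs load 8, and
// both cases move 16 bytes per vectorized load.
template <int kNBytes, int kNItems>
struct VecMath {
    static_assert(kNBytes == 2 || kNBytes == 4, "fp16/bf16 or fp32 inputs");
    static constexpr int kNElts    = kNBytes == 4 ? 4 : std::min(8, kNItems);
    static constexpr int kNLoads   = kNItems / kNElts;
    static constexpr int kVecBytes = kNBytes * kNElts;
};

int main() {
    using Fp32 = VecMath<4, 16>;   // e.g. float input, 16 items per thread
    using Fp16 = VecMath<2, 16>;   // e.g. half/bf16 input, 16 items per thread
    std::printf("fp32: %d elts/vec, %d loads, %d bytes/vec\n", Fp32::kNElts, Fp32::kNLoads, Fp32::kVecBytes);
    std::printf("fp16: %d elts/vec, %d loads, %d bytes/vec\n", Fp16::kNElts, Fp16::kNLoads, Fp16::kVecBytes);
}
```
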
kHasZ = Ktraits::kHasZ; + constexpr int kNThreads = Ktraits::kNThreads; + constexpr int kNItems = Ktraits::kNItems; + using input_t = typename Ktraits::input_t; + using weight_t = typename Ktraits::weight_t; + using scan_t = typename Ktraits::scan_t; + + // Shared memory. + extern __shared__ char smem_[]; + // cast to lvalue reference of expected type + // char *smem_loadstorescan = smem_ + 2 * MAX_DSTATE * sizeof(weight_t); + // auto& smem_load = reinterpret_cast(smem_ + 2 * MAX_DSTATE * sizeof(weight_t)); + // auto& smem_load = reinterpret_cast(smem_loadstorescan); + auto& smem_load = reinterpret_cast(smem_); + auto& smem_load_weight = reinterpret_cast(smem_); + auto& smem_load_weight1 = *reinterpret_cast(smem_ + sizeof(typename Ktraits::BlockLoadWeightT::TempStorage)); + auto& smem_store = reinterpret_cast(smem_); + auto& smem_exchange = *reinterpret_cast(smem_ + Ktraits::kSmemIOSize); + auto& smem_exchange1 = *reinterpret_cast(smem_ + Ktraits::kSmemIOSize + sizeof(typename Ktraits::BlockExchangeT::TempStorage)); + auto& smem_reduce = *reinterpret_cast(reinterpret_cast(&smem_exchange) + Ktraits::kSmemExchangeSize); + auto& smem_reduce_float = *reinterpret_cast(&smem_reduce); + auto& smem_reduce_complex = *reinterpret_cast(&smem_reduce); + auto& smem_scan = *reinterpret_cast(reinterpret_cast(&smem_reduce) + Ktraits::kSmemReduceSize); + auto& smem_reverse_scan = *reinterpret_cast(reinterpret_cast(&smem_scan) + sizeof(typename Ktraits::BlockScanT::TempStorage)); + weight_t *smem_delta_a = reinterpret_cast(smem_ + Ktraits::kSmemSize); + scan_t *smem_running_postfix = reinterpret_cast(smem_delta_a + 2 * MAX_DSTATE + kNThreads); + weight_t *smem_da = reinterpret_cast(smem_running_postfix + MAX_DSTATE); + weight_t *smem_dbc = reinterpret_cast(smem_da + MAX_DSTATE); + + const int batch_id = blockIdx.x; + const int dim_id = blockIdx.y; + const int group_id = dim_id / (params.dim_ngroups_ratio); + input_t *u = reinterpret_cast(params.u_ptr) + batch_id * params.u_batch_stride + + dim_id * params.u_d_stride; + input_t *delta = reinterpret_cast(params.delta_ptr) + batch_id * params.delta_batch_stride + + dim_id * params.delta_d_stride; + input_t *dout = reinterpret_cast(params.dout_ptr) + batch_id * params.dout_batch_stride + + dim_id * params.dout_d_stride; + weight_t *A = reinterpret_cast(params.A_ptr) + dim_id * params.A_d_stride; + weight_t *B = reinterpret_cast(params.B_ptr) + dim_id * params.B_d_stride; + input_t *Bvar = reinterpret_cast(params.B_ptr) + batch_id * params.B_batch_stride + group_id * params.B_group_stride; + weight_t *C = reinterpret_cast(params.C_ptr) + dim_id * params.C_d_stride; + input_t *Cvar = reinterpret_cast(params.C_ptr) + batch_id * params.C_batch_stride + group_id * params.C_group_stride; + weight_t *dA = reinterpret_cast(params.dA_ptr) + dim_id * params.dA_d_stride; + weight_t *dB = reinterpret_cast(params.dB_ptr) + + (!kIsVariableB ? dim_id * params.dB_d_stride : batch_id * (!kIsComplex ? params.dB_batch_stride : params.dB_batch_stride / 2) + group_id * params.dB_group_stride); + weight_t *dC = reinterpret_cast(params.dC_ptr) + + (!kIsVariableC ? dim_id * params.dC_d_stride : batch_id * (!kIsComplex ? params.dC_batch_stride : params.dC_batch_stride / 2) + group_id * params.dC_group_stride); + float *dD = params.dD_ptr == nullptr ? nullptr : reinterpret_cast(params.dD_ptr) + dim_id; + float D_val = params.D_ptr == nullptr ? 0 : reinterpret_cast(params.D_ptr)[dim_id]; + float *ddelta_bias = params.ddelta_bias_ptr == nullptr ? 
nullptr : reinterpret_cast(params.ddelta_bias_ptr) + dim_id; + float delta_bias = params.delta_bias_ptr == nullptr ? 0 : reinterpret_cast(params.delta_bias_ptr)[dim_id]; + scan_t *x = params.x_ptr == nullptr + ? nullptr + : reinterpret_cast(params.x_ptr) + (batch_id * params.dim + dim_id) * (params.n_chunks) * params.dstate; + float dD_val = 0; + float ddelta_bias_val = 0; + + constexpr int kChunkSize = kNThreads * kNItems; + u += (params.n_chunks - 1) * kChunkSize; + delta += (params.n_chunks - 1) * kChunkSize; + dout += (params.n_chunks - 1) * kChunkSize; + Bvar += (params.n_chunks - 1) * kChunkSize * (!kIsComplex ? 1 : 2); + Cvar += (params.n_chunks - 1) * kChunkSize * (!kIsComplex ? 1 : 2); + for (int chunk = params.n_chunks - 1; chunk >= 0; --chunk) { + input_t u_vals[kNItems]; + input_t delta_vals_load[kNItems]; + input_t dout_vals_load[kNItems]; + __syncthreads(); + load_input(u, u_vals, smem_load, params.seqlen - chunk * kChunkSize); + u -= kChunkSize; + __syncthreads(); + load_input(delta, delta_vals_load, smem_load, params.seqlen - chunk * kChunkSize); + // Will reload delta at the same location if kDeltaSoftplus + if constexpr (!kDeltaSoftplus) { delta -= kChunkSize; } + __syncthreads(); + load_input(dout, dout_vals_load, smem_load, params.seqlen - chunk * kChunkSize); + dout -= kChunkSize; + + float dout_vals[kNItems], delta_vals[kNItems]; + #pragma unroll + for (int i = 0; i < kNItems; ++i) { + dout_vals[i] = float(dout_vals_load[i]); + delta_vals[i] = float(delta_vals_load[i]) + delta_bias; + if constexpr (kDeltaSoftplus) { + delta_vals[i] = delta_vals[i] <= 20.f ? log1pf(expf(delta_vals[i])) : delta_vals[i]; + } + } + + if constexpr (kHasZ) { + input_t *z = reinterpret_cast(params.z_ptr) + batch_id * params.z_batch_stride + + dim_id * params.z_d_stride + chunk * kChunkSize; + input_t *out = reinterpret_cast(params.out_ptr) + batch_id * params.out_batch_stride + + dim_id * params.out_d_stride + chunk * kChunkSize; + input_t *dz = reinterpret_cast(params.dz_ptr) + batch_id * params.dz_batch_stride + + dim_id * params.dz_d_stride + chunk * kChunkSize; + input_t z_vals[kNItems], out_vals[kNItems]; + __syncthreads(); + load_input(z, z_vals, smem_load, params.seqlen - chunk * kChunkSize); + __syncthreads(); + load_input(out, out_vals, smem_load, params.seqlen - chunk * kChunkSize); + float dz_vals[kNItems], z_silu_vals[kNItems]; + #pragma unroll + for (int i = 0; i < kNItems; ++i) { + float z_val = z_vals[i]; + float z_sigmoid_val = 1.0f / (1.0f + expf(-z_val)); + z_silu_vals[i] = z_val * z_sigmoid_val; + dz_vals[i] = dout_vals[i] * float(out_vals[i]) * z_sigmoid_val + * (1.0f + z_val * (1.0f - z_sigmoid_val)); + dout_vals[i] *= z_silu_vals[i]; + } + __syncthreads(); + store_output(dz, dz_vals, smem_store, params.seqlen - chunk * kChunkSize); + if (params.out_z_ptr != nullptr) { // Recompute and store out_z + float out_z_vals[kNItems]; + #pragma unroll + for (int i = 0; i < kNItems; ++i) { out_z_vals[i] = float(out_vals[i]) * z_silu_vals[i]; } + // if (blockIdx.x == 0 && blockIdx.y == 0 && threadIdx.x == 0) { + // printf("out_val=%f, z_silu_val = %f, out_z_val = %f\n", float(out_vals[0]), z_silu_vals[0], out_z_vals[0]); + // } + input_t *out_z = reinterpret_cast(params.out_z_ptr) + batch_id * params.out_z_batch_stride + + dim_id * params.out_z_d_stride + chunk * kChunkSize; + __syncthreads(); + store_output(out_z, out_z_vals, smem_store, params.seqlen - chunk * kChunkSize); + } + } + + float du_vals[kNItems]; + #pragma unroll + for (int i = 0; i < kNItems; ++i) { du_vals[i] = 
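
In the gated branch above, the saved `out` is re-gated with `SiLU(z)` and the kernel needs the derivative of `out * SiLU(z)` with respect to `z`; the closed form it uses is `sigmoid(z) * (1 + z * (1 - sigmoid(z)))`. A scalar sketch (hypothetical values) checking that expression against a finite difference:

```cpp
#include <cmath>
#include <cstdio>

// out_z = out * silu(z),  silu(z) = z * sigmoid(z)
// d(out_z)/dz = out * sigmoid(z) * (1 + z * (1 - sigmoid(z)))  -- as in the kernel
int main() {
    const float out = 0.7f, z = -1.3f, dout = 1.0f;

    const float sig = 1.0f / (1.0f + std::exp(-z));
    const float dz_analytic = dout * out * sig * (1.0f + z * (1.0f - sig));

    // Finite-difference check of d(out * silu(z))/dz
    auto f = [&](float zz) { return out * zz / (1.0f + std::exp(-zz)); };
    const float eps = 1e-3f;
    const float dz_numeric = (f(z + eps) - f(z - eps)) / (2.0f * eps);

    std::printf("analytic %.6f vs numeric %.6f\n", dz_analytic, dz_numeric);
}
```
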
D_val * dout_vals[i]; } + #pragma unroll + for (int i = 0; i < kNItems; ++i) { dD_val += dout_vals[i] * float(u_vals[i]); } + + float ddelta_vals[kNItems] = {0}; + __syncthreads(); + for (int state_idx = 0; state_idx < params.dstate; ++state_idx) { + const weight_t A_val = A[state_idx * params.A_dstate_stride]; + // Multiply the real part of A with LOG2E so we can use exp2f instead of expf. + weight_t A_scaled; + constexpr float kLog2e = M_LOG2E; + if constexpr (!kIsComplex) { + A_scaled = A_val * kLog2e; + } else { + A_scaled = complex_t(A_val.real_ * kLog2e, A_val.imag_); + } + weight_t B_val, C_val; + weight_t B_vals[kNItems], C_vals[kNItems]; + if constexpr (!kIsVariableB) { + B_val = B[state_idx * params.B_dstate_stride]; + } else { + load_weight(Bvar + state_idx * params.B_dstate_stride, B_vals, + smem_load_weight, (params.seqlen - chunk * kChunkSize) * (!kIsComplex ? 1 : 2)); + } + if constexpr (!kIsVariableC) { + C_val = C[state_idx * params.C_dstate_stride]; + } else { + auto &smem_load_weight_C = !kIsVariableB ? smem_load_weight : smem_load_weight1; + load_weight(Cvar + state_idx * params.C_dstate_stride, C_vals, + smem_load_weight_C, (params.seqlen - chunk * kChunkSize) * (!kIsComplex ? 1 : 2)); + } + // const weight_t A_val = smem_a[state_idx]; + scan_t thread_data[kNItems], thread_reverse_data[kNItems]; + if constexpr (!kIsComplex) { + #pragma unroll + for (int i = 0; i < kNItems; ++i) { + const float delta_a_exp = exp2f(delta_vals[i] * A_scaled); + thread_data[i] = make_float2(delta_a_exp, !kIsVariableB ? delta_vals[i] * float(u_vals[i]) : delta_vals[i] * float(u_vals[i]) * B_vals[i]); + if (i == 0) { + smem_delta_a[threadIdx.x == 0 ? state_idx + (chunk % 2) * MAX_DSTATE : threadIdx.x + 2 * MAX_DSTATE] = delta_a_exp; + } else { + thread_reverse_data[i - 1].x = delta_a_exp; + } + thread_reverse_data[i].y = dout_vals[i] * + (!kIsVariableC + ? (!kIsVariableB ? B_val * C_val : C_val) + : (!kIsVariableB ? B_val * C_vals[i] : C_vals[i])); + } + __syncthreads(); + thread_reverse_data[kNItems - 1].x = threadIdx.x == kNThreads - 1 + ? (chunk == params.n_chunks - 1 ? 1.f : smem_delta_a[state_idx + ((chunk + 1) % 2) * MAX_DSTATE]) + : smem_delta_a[threadIdx.x + 1 + 2 * MAX_DSTATE]; + // Initialize running total + scan_t running_prefix = chunk > 0 && threadIdx.x % 32 == 0 ? x[(chunk - 1) * params.dstate + state_idx] : make_float2(1.f, 0.f); + SSMScanPrefixCallbackOp prefix_op(running_prefix); + Ktraits::BlockScanT(smem_scan).InclusiveScan( + thread_data, thread_data, SSMScanOp(), prefix_op + ); + scan_t running_postfix = chunk < params.n_chunks - 1 && threadIdx.x % 32 == 0 ? smem_running_postfix[state_idx] : make_float2(1.f, 0.f); + SSMScanPrefixCallbackOp postfix_op(running_postfix); + Ktraits::BlockReverseScanT(smem_reverse_scan).InclusiveReverseScan( + thread_reverse_data, thread_reverse_data, SSMScanOp(), postfix_op + ); + if (threadIdx.x == 0) { smem_running_postfix[state_idx] = postfix_op.running_prefix; } + weight_t dA_val = 0, dBC_val = 0; + weight_t dB_vals[kNItems], dC_vals[kNItems]; + #pragma unroll + for (int i = 0; i < kNItems; ++i) { + const float dx = thread_reverse_data[i].y; + const float ddelta_u = !kIsVariableB ? dx : dx * B_vals[i]; + du_vals[i] += ddelta_u * delta_vals[i]; + const float a = thread_data[i].y - (!kIsVariableB ? 
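
Earlier in this state loop, the real part of `A` is pre-multiplied by `log2(e)` so the inner loop can call `exp2f`, which maps to a single hardware instruction, instead of `expf`. The identity being exploited is simply `exp(x) = 2^(x * log2 e)`, checked here on a few arbitrary values:

```cpp
#include <cmath>
#include <cstdio>

// expf(x) == exp2f(x * log2(e)); the kernel folds the log2(e) factor into A
// once per state so the element loop only pays for exp2f.
int main() {
    const float kLog2e = 1.4426950408889634f;   // log2(e)
    for (float x : {-3.0f, -0.5f, 0.0f, 1.7f}) {
        std::printf("x=%5.2f  expf=%.7f  exp2f=%.7f\n",
                    x, std::exp(x), std::exp2(x * kLog2e));
    }
}
```
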
delta_vals[i] * float(u_vals[i]) : delta_vals[i] * float(u_vals[i]) * B_vals[i]); + ddelta_vals[i] += ddelta_u * float(u_vals[i]) + dx * A_val * a; + dA_val += dx * delta_vals[i] * a; + if constexpr (!kIsVariableB || !kIsVariableC) { + if constexpr (!kIsVariableB) { // dBC_val is dB_val + dBC_val += dout_vals[i] * (!kIsVariableC ? thread_data[i].y : thread_data[i].y * C_vals[i]); + } else { // dBC_val is dC_val + dBC_val += dout_vals[i] * thread_data[i].y; + } + } + if constexpr (kIsVariableB) { dB_vals[i] = dx * delta_vals[i] * float(u_vals[i]); } + if constexpr (kIsVariableC) { + dC_vals[i] = dout_vals[i] * (!kIsVariableB ? thread_data[i].y * B_val : thread_data[i].y); + } + } + // Block-exchange to make the atomicAdd's coalesced, otherwise they're much slower + if constexpr (kIsVariableB || kIsVariableC) { + if constexpr (kIsVariableB) { + Ktraits::BlockExchangeT(smem_exchange).BlockedToStriped(dB_vals, dB_vals); + } + if constexpr (kIsVariableC) { + auto &smem_exchange_C = !kIsVariableB ? smem_exchange : smem_exchange1; + Ktraits::BlockExchangeT(smem_exchange_C).BlockedToStriped(dC_vals, dC_vals); + } + const int seqlen_remaining = params.seqlen - chunk * kChunkSize - threadIdx.x; + weight_t *dB_cur = dB + state_idx * params.dB_dstate_stride + chunk * kChunkSize + threadIdx.x; + weight_t *dC_cur = dC + state_idx * params.dC_dstate_stride + chunk * kChunkSize + threadIdx.x; + #pragma unroll + for (int i = 0; i < kNItems; ++i) { + if (i * kNThreads < seqlen_remaining) { + if constexpr (kIsVariableB) { gpuAtomicAdd(dB_cur + i * kNThreads, dB_vals[i]); } + if constexpr (kIsVariableC) { gpuAtomicAdd(dC_cur + i * kNThreads, dC_vals[i]); } + } + } + } + if constexpr (!kIsVariableB || !kIsVariableC) { + float2 dA_dBC_val = make_float2(dA_val, dBC_val); + dA_dBC_val = Ktraits::BlockReduceT(smem_reduce).Sum(dA_dBC_val); + dA_val = dA_dBC_val.x; + if (threadIdx.x == 0) { + smem_dbc[state_idx] = chunk == params.n_chunks - 1 ? dA_dBC_val.y : dA_dBC_val.y + smem_dbc[state_idx]; + } + } else { + dA_val = Ktraits::BlockReduceFloatT(smem_reduce_float).Sum(dA_val); + } + if (threadIdx.x == 0) { + smem_da[state_idx] = chunk == params.n_chunks - 1 ? dA_val : dA_val + smem_da[state_idx]; + } + } else { + #pragma unroll + for (int i = 0; i < kNItems; ++i) { + // Pytorch's implementation of complex exp (which calls thrust) is very slow + complex_t delta_a_exp = cexp2f(delta_vals[i] * A_scaled); + weight_t B_delta_u_val = !kIsVariableB ? delta_vals[i] * float(u_vals[i]) : B_vals[i] * delta_vals[i] * float(u_vals[i]); + thread_data[i] = make_float4(delta_a_exp.real_, delta_a_exp.imag_, B_delta_u_val.real_, B_delta_u_val.imag_); + if (i == 0) { + smem_delta_a[threadIdx.x == 0 ? state_idx + (chunk % 2) * MAX_DSTATE : threadIdx.x + 2 * MAX_DSTATE] = delta_a_exp; + } else { + thread_reverse_data[i - 1].x = delta_a_exp.real_; + thread_reverse_data[i - 1].y = -delta_a_exp.imag_; + } + complex_t dout_BC = 2 * dout_vals[i] + * conj(!kIsVariableC + ? (!kIsVariableB ? B_val * C_val : C_val) + : (!kIsVariableB ? B_val * C_vals[i] : C_vals[i])); + thread_reverse_data[i].z = dout_BC.real_; + thread_reverse_data[i].w = dout_BC.imag_; + } + __syncthreads(); + complex_t delta_a_exp = threadIdx.x == kNThreads - 1 + ? (chunk == params.n_chunks - 1 ? 
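
The `BlockedToStriped` exchange above reorders each thread's items so that, in the atomic-add loop, item `i` of thread `t` lands at offset `i * kNThreads + t`: neighbouring threads then touch neighbouring addresses and the atomics coalesce. A tiny index-mapping sketch of blocked vs. striped layouts (host-side, with deliberately small sizes):

```cpp
#include <cstdio>

// Addresses touched at loop iteration i by thread t, for a chunk of
// kNThreads * kNItems elements:
//   blocked: t * kNItems + i   (neighbouring threads are kNItems apart)
//   striped: i * kNThreads + t (neighbouring threads are adjacent -> coalesced)
int main() {
    const int kNThreads = 4, kNItems = 3;
    for (int i = 0; i < kNItems; ++i) {
        std::printf("iter %d  blocked:", i);
        for (int t = 0; t < kNThreads; ++t) std::printf(" %2d", t * kNItems + i);
        std::printf("   striped:");
        for (int t = 0; t < kNThreads; ++t) std::printf(" %2d", i * kNThreads + t);
        std::printf("\n");
    }
}
```
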
1.f : smem_delta_a[state_idx + ((chunk + 1) % 2) * MAX_DSTATE]) + : smem_delta_a[threadIdx.x + 1 + 2 * MAX_DSTATE]; + thread_reverse_data[kNItems - 1].x = delta_a_exp.real_; + thread_reverse_data[kNItems - 1].y = -delta_a_exp.imag_; + // Initialize running total + scan_t running_prefix = chunk > 0 && threadIdx.x % 32 == 0 ? x[(chunk - 1) * params.dstate + state_idx] : make_float4(1.f, 0.f, 0.f, 0.f); + SSMScanPrefixCallbackOp prefix_op(running_prefix); + Ktraits::BlockScanT(smem_scan).InclusiveScan( + thread_data, thread_data, SSMScanOp(), prefix_op + ); + scan_t running_postfix = chunk < params.n_chunks - 1 && threadIdx.x % 32 == 0 ? smem_running_postfix[state_idx] : make_float4(1.f, 0.f, 0.f, 0.f); + SSMScanPrefixCallbackOp postfix_op(running_postfix); + Ktraits::BlockReverseScanT(smem_reverse_scan).InclusiveReverseScan( + thread_reverse_data, thread_reverse_data, SSMScanOp(), postfix_op + ); + if (threadIdx.x == 0) { smem_running_postfix[state_idx] = postfix_op.running_prefix; } + weight_t dA_val = 0, dBC_val = 0; + weight_t dB_vals[kNItems], dC_vals[kNItems]; + #pragma unroll + for (int i = 0; i < kNItems; ++i) { + complex_t x = complex_t(thread_data[i].z, thread_data[i].w); + complex_t dx = complex_t(thread_reverse_data[i].z, thread_reverse_data[i].w); + float ddelta_u = !kIsVariableB ? dx.real_ : (dx * conj(B_vals[i])).real_; + if constexpr (!kIsVariableB || !kIsVariableC) { + if constexpr (!kIsVariableB) { // dBC_val is dB_val + dBC_val += (2 * dout_vals[i]) * conj(!kIsVariableC ? x : x * C_vals[i]); + } else { // dBC_val is dC_val + dBC_val += (2 * dout_vals[i]) * conj(x); + } + } + const complex_t a_conj = conj(x - (!kIsVariableB ? delta_vals[i] * float(u_vals[i]) : delta_vals[i] * float(u_vals[i]) * B_vals[i])); + du_vals[i] += ddelta_u * delta_vals[i]; + ddelta_vals[i] += ddelta_u * float(u_vals[i]) + (dx * conj(A_val) * a_conj).real_; + dA_val += delta_vals[i] * dx * a_conj; + if constexpr (kIsVariableB) { dB_vals[i] = dx * delta_vals[i] * float(u_vals[i]); } + if constexpr (kIsVariableC) { + dC_vals[i] = (2 * dout_vals[i]) * conj(!kIsVariableB ? x * B_val : x); + } + } + // Block-exchange to make the atomicAdd's coalesced, otherwise they're much slower + if constexpr (kIsVariableB || kIsVariableC) { + float dB_vals_f[kNItems * 2], dC_vals_f[kNItems * 2]; + if constexpr (kIsVariableB) { + #pragma unroll + for (int i = 0; i < kNItems; ++i) { + dB_vals_f[i * 2] = dB_vals[i].real_; + dB_vals_f[i * 2 + 1] = dB_vals[i].imag_; + } + Ktraits::BlockExchangeT(smem_exchange).BlockedToStriped(dB_vals_f, dB_vals_f); + } + if constexpr (kIsVariableC) { + #pragma unroll + for (int i = 0; i < kNItems; ++i) { + dC_vals_f[i * 2] = dC_vals[i].real_; + dC_vals_f[i * 2 + 1] = dC_vals[i].imag_; + } + auto &smem_exchange_C = !kIsVariableB ? 
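
The complex branch replaces PyTorch's slow complex `exp` with `cexp2f`, which computes `2^re * (cos im + i*sin im)`; that is why only the real part of `A` carries the `log2(e)` scaling while the imaginary part stays in radians. A host-side model of the same identity using `std::complex` (the test values are arbitrary):

```cpp
#include <cmath>
#include <complex>
#include <cstdio>

// Host model of cexp2f: 2^re * (cos(im) + i*sin(im)), compared against
// std::exp of (re*ln2 + i*im). Only the real part carries the log2 scaling.
int main() {
    const float re = -0.8f, im = 2.1f;

    const float t = std::exp2(re);
    const std::complex<float> cexp2f_like(t * std::cos(im), t * std::sin(im));

    const std::complex<float> reference =
        std::exp(std::complex<float>(re * 0.6931471805599453f /* ln 2 */, im));

    std::printf("cexp2f-like: (%.6f, %.6f)\n", cexp2f_like.real(), cexp2f_like.imag());
    std::printf("reference  : (%.6f, %.6f)\n", reference.real(), reference.imag());
}
```
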
smem_exchange : smem_exchange1; + Ktraits::BlockExchangeT(smem_exchange_C).BlockedToStriped(dC_vals_f, dC_vals_f); + } + const int seqlen_remaining = (params.seqlen - chunk * kChunkSize) * 2 - threadIdx.x; + float *dB_cur = reinterpret_cast(dB) + state_idx * params.dB_dstate_stride + chunk * kChunkSize * 2 + threadIdx.x; + float *dC_cur = reinterpret_cast(dC) + state_idx * params.dC_dstate_stride + chunk * kChunkSize * 2 + threadIdx.x; + #pragma unroll + for (int i = 0; i < kNItems * 2; ++i) { + if (i * kNThreads < seqlen_remaining) { + if constexpr (kIsVariableB) { gpuAtomicAdd(dB_cur + i * kNThreads, dB_vals_f[i]); } + if constexpr (kIsVariableC) { gpuAtomicAdd(dC_cur + i * kNThreads, dC_vals_f[i]); } + } + } + } + if constexpr (!kIsVariableB || !kIsVariableC) { + float4 dA_dBC_val = make_float4(dA_val.real_, dA_val.imag_, dBC_val.real_, dBC_val.imag_); + dA_dBC_val = Ktraits::BlockReduceT(smem_reduce).Sum(dA_dBC_val); + dA_val = complex_t(dA_dBC_val.x, dA_dBC_val.y); + dBC_val = complex_t(dA_dBC_val.z, dA_dBC_val.w); + if (threadIdx.x == 0) { + smem_dbc[state_idx] = chunk == params.n_chunks - 1 ? dBC_val : dBC_val + smem_dbc[state_idx]; + } + } else { + dA_val = Ktraits::BlockReduceComplexT(smem_reduce_complex).Sum(dA_val); + } + if (threadIdx.x == 0) { + smem_da[state_idx] = chunk == params.n_chunks - 1 ? dA_val : dA_val + smem_da[state_idx]; + } + } + } + + if constexpr (kDeltaSoftplus) { + __syncthreads(); + input_t delta_vals_load[kNItems]; + load_input(delta, delta_vals_load, smem_load, params.seqlen - chunk * kChunkSize); + delta -= kChunkSize; + #pragma unroll + for (int i = 0; i < kNItems; ++i) { + float delta_val = float(delta_vals_load[i]) + delta_bias; + float delta_val_neg_exp = expf(-delta_val); + ddelta_vals[i] = delta_val <= 20.f + ? ddelta_vals[i] / (1.f + delta_val_neg_exp) + : ddelta_vals[i]; + } + } + for (int i = 0; i < kNItems; ++i) { ddelta_bias_val += ddelta_vals[i]; } + + input_t *du = reinterpret_cast(params.du_ptr) + batch_id * params.du_batch_stride + + dim_id * params.du_d_stride + chunk * kChunkSize; + input_t *ddelta = reinterpret_cast(params.ddelta_ptr) + batch_id * params.ddelta_batch_stride + + dim_id * params.ddelta_d_stride + chunk * kChunkSize; + __syncthreads(); + store_output(du, du_vals, smem_store, params.seqlen - chunk * kChunkSize); + __syncthreads(); + store_output(ddelta, ddelta_vals, smem_store, params.seqlen - chunk * kChunkSize); + + Bvar -= kChunkSize * (!kIsComplex ? 1 : 2); + Cvar -= kChunkSize * (!kIsComplex ? 1 : 2); + } + if (params.dD_ptr != nullptr) { + dD_val = Ktraits::BlockReduceFloatT(smem_reduce_float).Sum(dD_val); + if (threadIdx.x == 0) { gpuAtomicAdd(dD, dD_val); } + } + if (params.ddelta_bias_ptr != nullptr) { + __syncthreads(); + ddelta_bias_val = Ktraits::BlockReduceFloatT(smem_reduce_float).Sum(ddelta_bias_val); + if (threadIdx.x == 0) { gpuAtomicAdd(ddelta_bias, ddelta_bias_val); } + } + for (int state_idx = threadIdx.x; state_idx < params.dstate; state_idx += blockDim.x) { + gpuAtomicAdd(&(dA[state_idx * params.dA_dstate_stride]), smem_da[state_idx]); + weight_t dBC_val; + if (!kIsVariableB || !kIsVariableC) { dBC_val = smem_dbc[state_idx]; } + if constexpr (!kIsVariableB) { + gpuAtomicAdd(&(dB[state_idx * params.dB_dstate_stride]), + !kIsVariableC ? dBC_val * conj(C[state_idx * params.C_dstate_stride]) : dBC_val); + } + if constexpr (!kIsVariableC) { + gpuAtomicAdd(&(dC[state_idx * params.dC_dstate_stride]), + !kIsVariableB ? 
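
The `kDeltaSoftplus` epilogue above applies the chain rule for softplus: its derivative is `sigmoid(x) = 1 / (1 + exp(-x))`, and past `x = 20` both the forward and the backward treat softplus as the identity, which avoids overflowing `expf`. A small sketch of the clamped pair of formulas:

```cpp
#include <cmath>
#include <cstdio>

// softplus(x) = log(1 + exp(x)); its derivative is sigmoid(x) = 1/(1 + exp(-x)).
// Past x = 20 both directions treat softplus as the identity, matching the
// clamping used in the kernels.
int main() {
    for (float x : {-4.0f, 0.0f, 3.0f, 25.0f}) {
        const float y  = x <= 20.f ? std::log1p(std::exp(x)) : x;          // forward
        const float dy = x <= 20.f ? 1.0f / (1.0f + std::exp(-x)) : 1.0f;  // backward factor
        std::printf("x=%6.2f  softplus=%10.6f  d/dx=%.6f\n", x, y, dy);
    }
}
```
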
dBC_val * conj(B[state_idx * params.B_dstate_stride]) : dBC_val); + } + } +} + +template +void selective_scan_bwd_launch(SSMParamsBwd ¶ms, cudaStream_t stream) { + BOOL_SWITCH(params.seqlen % (kNThreads * kNItems) == 0, kIsEvenLen, [&] { + BOOL_SWITCH(params.is_variable_B, kIsVariableB, [&] { + BOOL_SWITCH(params.is_variable_C, kIsVariableC, [&] { + BOOL_SWITCH(params.delta_softplus, kDeltaSoftplus, [&] { + BOOL_SWITCH(params.z_ptr != nullptr , kHasZ, [&] { + using Ktraits = Selective_Scan_bwd_kernel_traits; + // using Ktraits = Selective_Scan_bwd_kernel_traits; + // TODO: check this + constexpr int kSmemSize = Ktraits::kSmemSize + MAX_DSTATE * sizeof(typename Ktraits::scan_t) + (kNThreads + 4 * MAX_DSTATE) * sizeof(typename Ktraits::weight_t); + // printf("smem_size = %d\n", kSmemSize); + dim3 grid(params.batch, params.dim); + auto kernel = &selective_scan_bwd_kernel; + if (kSmemSize >= 48 * 1024) { + C10_CUDA_CHECK(cudaFuncSetAttribute( + kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize)); + } + kernel<<>>(params); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); + }); + }); + }); + }); +} + +template +void selective_scan_bwd_cuda(SSMParamsBwd ¶ms, cudaStream_t stream) { + if (params.seqlen <= 128) { + selective_scan_bwd_launch<32, 4, input_t, weight_t>(params, stream); + } else if (params.seqlen <= 256) { + selective_scan_bwd_launch<32, 8, input_t, weight_t>(params, stream); + } else if (params.seqlen <= 512) { + selective_scan_bwd_launch<32, 16, input_t, weight_t>(params, stream); + } else if (params.seqlen <= 1024) { + selective_scan_bwd_launch<64, 16, input_t, weight_t>(params, stream); + } else { + selective_scan_bwd_launch<128, 16, input_t, weight_t>(params, stream); + } +} \ No newline at end of file diff --git a/mamba/csrc/selective_scan/selective_scan_common.h b/mamba/csrc/selective_scan/selective_scan_common.h new file mode 100644 index 0000000000000000000000000000000000000000..9140dcdf3b68ad2de95bcd3fd9543a9d320cef68 --- /dev/null +++ b/mamba/csrc/selective_scan/selective_scan_common.h @@ -0,0 +1,221 @@ +/****************************************************************************** + * Copyright (c) 2023, Tri Dao. 
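
The dispatcher above chooses `(kNThreads, kNItems)` from `seqlen` so that a single chunk of `kNThreads * kNItems` elements covers the whole sequence whenever it is short enough; the default configuration handles 2048 elements per chunk, which matches the `n_chunks = ceil(seqlen / 2048)` computed on the host. A sketch that just tabulates those capacities:

```cpp
#include <cstdio>

// Chunk capacity (kNThreads * kNItems) for each branch of
// selective_scan_bwd_cuda: seqlen <= 128, 256, 512, 1024, and the default.
// Longer sequences reuse the last configuration and loop over chunks.
int main() {
    const int threads[] = {32, 32, 32, 64, 128};
    const int items[]   = { 4,  8, 16, 16,  16};
    for (int i = 0; i < 5; ++i) {
        std::printf("branch %d: kNThreads=%3d, kNItems=%2d -> %4d elements per chunk\n",
                    i, threads[i], items[i], threads[i] * items[i]);
    }
}
```
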
+ ******************************************************************************/ + +#pragma once + +#include +#include +#include // For scalar_value_type + +#define MAX_DSTATE 256 + +using complex_t = c10::complex; + +inline __device__ float2 operator+(const float2 & a, const float2 & b){ + return {a.x + b.x, a.y + b.y}; +} + +inline __device__ float3 operator+(const float3 &a, const float3 &b) { + return {a.x + b.x, a.y + b.y, a.z + b.z}; +} + +inline __device__ float4 operator+(const float4 & a, const float4 & b){ + return {a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w}; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template struct BytesToType {}; + +template<> struct BytesToType<16> { + using Type = uint4; + static_assert(sizeof(Type) == 16); +}; + +template<> struct BytesToType<8> { + using Type = uint64_t; + static_assert(sizeof(Type) == 8); +}; + +template<> struct BytesToType<4> { + using Type = uint32_t; + static_assert(sizeof(Type) == 4); +}; + +template<> struct BytesToType<2> { + using Type = uint16_t; + static_assert(sizeof(Type) == 2); +}; + +template<> struct BytesToType<1> { + using Type = uint8_t; + static_assert(sizeof(Type) == 1); +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Converter{ + static inline __device__ void to_float(const scalar_t (&src)[N], float (&dst)[N]) { + #pragma unroll + for (int i = 0; i < N; ++i) { dst[i] = src[i]; } + } +}; + +template +struct Converter{ + static inline __device__ void to_float(const at::Half (&src)[N], float (&dst)[N]) { + static_assert(N % 2 == 0); + auto &src2 = reinterpret_cast(src); + auto &dst2 = reinterpret_cast(dst); + #pragma unroll + for (int i = 0; i < N / 2; ++i) { dst2[i] = __half22float2(src2[i]); } + } +}; + +#if __CUDA_ARCH__ >= 800 +template +struct Converter{ + static inline __device__ void to_float(const at::BFloat16 (&src)[N], float (&dst)[N]) { + static_assert(N % 2 == 0); + auto &src2 = reinterpret_cast(src); + auto &dst2 = reinterpret_cast(dst); + #pragma unroll + for (int i = 0; i < N / 2; ++i) { dst2[i] = __bfloat1622float2(src2[i]); } + } +}; +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// From https://stackoverflow.com/questions/9860711/cucomplex-h-and-exp +// and https://forums.developer.nvidia.com/t/complex-number-exponential-function/24696 +__device__ __forceinline__ complex_t cexp2f(complex_t z) { + float t = exp2f(z.real_); + float c, s; + sincosf(z.imag_, &s, &c); + return complex_t(c * t, s * t); +} + +__device__ __forceinline__ complex_t cexpf(complex_t z) { + float t = expf(z.real_); + float c, s; + sincosf(z.imag_, &s, &c); + return complex_t(c * t, s * t); +} + +template struct SSMScanOp; + +template<> +struct SSMScanOp { + __device__ __forceinline__ float2 operator()(const float2 &ab0, const float2 &ab1) const { + return make_float2(ab1.x * ab0.x, ab1.x * ab0.y + ab1.y); + } +}; + +template<> +struct SSMScanOp { + __device__ __forceinline__ float4 operator()(const float4 &ab0, const float4 &ab1) const { + complex_t a0 = complex_t(ab0.x, ab0.y); + complex_t b0 = complex_t(ab0.z, ab0.w); + complex_t a1 = complex_t(ab1.x, ab1.y); + complex_t b1 = complex_t(ab1.z, ab1.w); + complex_t out_a = a1 * a0; + complex_t out_b = a1 * b0 + b1; + return make_float4(out_a.real_, out_a.imag_, out_b.real_, out_b.imag_); + } +}; + +// A stateful callback functor that maintains a running prefix to be 
applied +// during consecutive scan operations. +template struct SSMScanPrefixCallbackOp { + using scan_t = std::conditional_t, float2, float4>; + scan_t running_prefix; + // Constructor + __device__ SSMScanPrefixCallbackOp(scan_t running_prefix_) : running_prefix(running_prefix_) {} + // Callback operator to be entered by the first warp of threads in the block. + // Thread-0 is responsible for returning a value for seeding the block-wide scan. + __device__ scan_t operator()(scan_t block_aggregate) { + scan_t old_prefix = running_prefix; + running_prefix = SSMScanOp()(running_prefix, block_aggregate); + return old_prefix; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void load_input(typename Ktraits::input_t *u, + typename Ktraits::input_t (&u_vals)[Ktraits::kNItems], + typename Ktraits::BlockLoadT::TempStorage &smem_load, + int seqlen) { + if constexpr (Ktraits::kIsEvenLen) { + auto& smem_load_vec = reinterpret_cast(smem_load); + using vec_t = typename Ktraits::vec_t; + Ktraits::BlockLoadVecT(smem_load_vec).Load( + reinterpret_cast(u), + reinterpret_cast(u_vals) + ); + } else { + Ktraits::BlockLoadT(smem_load).Load(u, u_vals, seqlen, 0.f); + } +} + +template +inline __device__ void load_weight(typename Ktraits::input_t *Bvar, + typename Ktraits::weight_t (&B_vals)[Ktraits::kNItems], + typename Ktraits::BlockLoadWeightT::TempStorage &smem_load_weight, + int seqlen) { + constexpr int kNItems = Ktraits::kNItems; + if constexpr (!Ktraits::kIsComplex) { + typename Ktraits::input_t B_vals_load[kNItems]; + if constexpr (Ktraits::kIsEvenLen) { + auto& smem_load_weight_vec = reinterpret_cast(smem_load_weight); + using vec_t = typename Ktraits::vec_t; + Ktraits::BlockLoadWeightVecT(smem_load_weight_vec).Load( + reinterpret_cast(Bvar), + reinterpret_cast(B_vals_load) + ); + } else { + Ktraits::BlockLoadWeightT(smem_load_weight).Load(Bvar, B_vals_load, seqlen, 0.f); + } + // #pragma unroll + // for (int i = 0; i < kNItems; ++i) { B_vals[i] = B_vals_load[i]; } + Converter::to_float(B_vals_load, B_vals); + } else { + typename Ktraits::input_t B_vals_load[kNItems * 2]; + if constexpr (Ktraits::kIsEvenLen) { + auto& smem_load_weight_vec = reinterpret_cast(smem_load_weight); + using vec_t = typename Ktraits::vec_t; + Ktraits::BlockLoadWeightVecT(smem_load_weight_vec).Load( + reinterpret_cast(Bvar), + reinterpret_cast(B_vals_load) + ); + } else { + Ktraits::BlockLoadWeightT(smem_load_weight).Load(Bvar, B_vals_load, seqlen, 0.f); + } + #pragma unroll + for (int i = 0; i < kNItems; ++i) { B_vals[i] = complex_t(B_vals_load[i * 2], B_vals_load[i * 2 + 1]); } + } +} + +template +inline __device__ void store_output(typename Ktraits::input_t *out, + const float (&out_vals)[Ktraits::kNItems], + typename Ktraits::BlockStoreT::TempStorage &smem_store, + int seqlen) { + typename Ktraits::input_t write_vals[Ktraits::kNItems]; + #pragma unroll + for (int i = 0; i < Ktraits::kNItems; ++i) { write_vals[i] = out_vals[i]; } + if constexpr (Ktraits::kIsEvenLen) { + auto& smem_store_vec = reinterpret_cast(smem_store); + using vec_t = typename Ktraits::vec_t; + Ktraits::BlockStoreVecT(smem_store_vec).Store( + reinterpret_cast(out), + reinterpret_cast(write_vals) + ); + } else { + Ktraits::BlockStoreT(smem_store).Store(out, write_vals, seqlen); + } +} diff --git a/mamba/csrc/selective_scan/selective_scan_fwd_bf16.cu b/mamba/csrc/selective_scan/selective_scan_fwd_bf16.cu new file mode 100644 index 
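
The `SSMScanOp` / `SSMScanPrefixCallbackOp` pair above is what turns the sequential state recurrence `h_t = a_t * h_{t-1} + b_t` into a parallel scan: each element `(a, b)` encodes the affine map `h -> a*h + b`, composing two of them gives exactly the `(a1*a0, a1*b0 + b1)` combine used here, and the callback carries the composed map from one chunk into the next. A host-side sketch (made-up coefficients) checking that the scan reproduces the recurrence:

```cpp
#include <cstdio>
#include <utility>
#include <vector>

// Scan element (a, b) represents h -> a*h + b. Composing "ab0 then ab1" gives
// (a1*a0, a1*b0 + b1), the same combine as SSMScanOp<float>.
using AB = std::pair<float, float>;
static AB combine(const AB &ab0, const AB &ab1) {
    return {ab1.first * ab0.first, ab1.first * ab0.second + ab1.second};
}

int main() {
    // Hypothetical per-step coefficients (a_t ~ exp(delta_t*A), b_t ~ delta_t*B_t*u_t).
    std::vector<AB> steps = {{0.9f, 0.3f}, {0.8f, -0.2f}, {0.95f, 0.5f}, {0.7f, 0.1f}};
    const float h0 = 2.0f;

    // Sequential recurrence h_t = a_t * h_{t-1} + b_t
    float h = h0;
    for (const AB &s : steps) h = s.first * h + s.second;

    // Inclusive scan with the associative combine, then apply the composed map to h0
    AB acc = steps[0];
    for (size_t i = 1; i < steps.size(); ++i) acc = combine(acc, steps[i]);
    const float h_scan = acc.first * h0 + acc.second;

    std::printf("sequential %.6f  vs  scan %.6f\n", h, h_scan);   // both 1.434200
}
```
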
0000000000000000000000000000000000000000..2b8615b1d522c119125d4cb6ff3dce42f2bd4659 --- /dev/null +++ b/mamba/csrc/selective_scan/selective_scan_fwd_bf16.cu @@ -0,0 +1,10 @@ +/****************************************************************************** + * Copyright (c) 2023, Tri Dao. + ******************************************************************************/ + +// Split into multiple files to compile in paralell + +#include "selective_scan_fwd_kernel.cuh" + +template void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream); +template void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream); \ No newline at end of file diff --git a/mamba/csrc/selective_scan/selective_scan_fwd_fp16.cu b/mamba/csrc/selective_scan/selective_scan_fwd_fp16.cu new file mode 100644 index 0000000000000000000000000000000000000000..015e2a0eff633daf2693e43a2648008652a38c7c --- /dev/null +++ b/mamba/csrc/selective_scan/selective_scan_fwd_fp16.cu @@ -0,0 +1,10 @@ +/****************************************************************************** + * Copyright (c) 2023, Tri Dao. + ******************************************************************************/ + +// Split into multiple files to compile in paralell + +#include "selective_scan_fwd_kernel.cuh" + +template void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream); +template void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream); \ No newline at end of file diff --git a/mamba/csrc/selective_scan/selective_scan_fwd_fp32.cu b/mamba/csrc/selective_scan/selective_scan_fwd_fp32.cu new file mode 100644 index 0000000000000000000000000000000000000000..c142fe0208ea784679122ba04997d3432b05efcc --- /dev/null +++ b/mamba/csrc/selective_scan/selective_scan_fwd_fp32.cu @@ -0,0 +1,10 @@ +/****************************************************************************** + * Copyright (c) 2023, Tri Dao. + ******************************************************************************/ + +// Split into multiple files to compile in paralell + +#include "selective_scan_fwd_kernel.cuh" + +template void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream); +template void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream); \ No newline at end of file diff --git a/mamba/csrc/selective_scan/selective_scan_fwd_kernel.cuh b/mamba/csrc/selective_scan/selective_scan_fwd_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..440a209108bfe120c73d123bbf0b82ccf43a5638 --- /dev/null +++ b/mamba/csrc/selective_scan/selective_scan_fwd_kernel.cuh @@ -0,0 +1,345 @@ +/****************************************************************************** + * Copyright (c) 2023, Tri Dao. + ******************************************************************************/ + +#pragma once + +#include +#include +#include // For C10_CUDA_CHECK and C10_CUDA_KERNEL_LAUNCH_CHECK + +#include +#include +#include + +#include "selective_scan.h" +#include "selective_scan_common.h" +#include "static_switch.h" + +template +struct Selective_Scan_fwd_kernel_traits { + static_assert(kNItems_ % 4 == 0); + using input_t = input_t_; + using weight_t = weight_t_; + static constexpr int kNThreads = kNThreads_; + // Setting MinBlocksPerMP to be 3 (instead of 2) for 128 threads improves occupancy. + static constexpr int kMinBlocks = kNThreads < 128 ? 
5 : 3; + static constexpr int kNItems = kNItems_; + static constexpr int kNRows = kNRows_; + static constexpr int kNBytes = sizeof(input_t); + static_assert(kNBytes == 2 || kNBytes == 4); + static constexpr int kNElts = kNBytes == 4 ? 4 : std::min(8, kNItems); + static_assert(kNItems % kNElts == 0); + static constexpr int kNLoads = kNItems / kNElts; + static constexpr bool kIsComplex = std::is_same_v; + static constexpr bool kIsEvenLen = kIsEvenLen_; + static constexpr bool kIsVariableB = kIsVariableB_; + static constexpr bool kIsVariableC = kIsVariableC_; + static constexpr bool kHasZ = kHasZ_; + + static constexpr bool kDirectIO = kIsEvenLen && kNLoads == 1; + + using vec_t = typename BytesToType::Type; + using scan_t = std::conditional_t; + using BlockLoadT = cub::BlockLoad; + using BlockLoadVecT = cub::BlockLoad; + using BlockLoadWeightT = cub::BlockLoad; + using BlockLoadWeightVecT = cub::BlockLoad; + using BlockStoreT = cub::BlockStore; + using BlockStoreVecT = cub::BlockStore; + // using BlockScanT = cub::BlockScan; + // using BlockScanT = cub::BlockScan; + using BlockScanT = cub::BlockScan; + static constexpr int kSmemIOSize = std::max({sizeof(typename BlockLoadT::TempStorage), + sizeof(typename BlockLoadVecT::TempStorage), + (int(kIsVariableB) + int(kIsVariableC)) * sizeof(typename BlockLoadWeightT::TempStorage), + (int(kIsVariableB) + int(kIsVariableC)) * sizeof(typename BlockLoadWeightVecT::TempStorage), + sizeof(typename BlockStoreT::TempStorage), + sizeof(typename BlockStoreVecT::TempStorage)}); + static constexpr int kSmemSize = kSmemIOSize + sizeof(typename BlockScanT::TempStorage); +}; + +template +__global__ __launch_bounds__(Ktraits::kNThreads, Ktraits::kMinBlocks) +void selective_scan_fwd_kernel(SSMParamsBase params) { + constexpr bool kIsComplex = Ktraits::kIsComplex; + constexpr bool kIsVariableB = Ktraits::kIsVariableB; + constexpr bool kIsVariableC = Ktraits::kIsVariableC; + constexpr bool kHasZ = Ktraits::kHasZ; + constexpr int kNThreads = Ktraits::kNThreads; + constexpr int kNItems = Ktraits::kNItems; + constexpr int kNRows = Ktraits::kNRows; + constexpr bool kDirectIO = Ktraits::kDirectIO; + using input_t = typename Ktraits::input_t; + using weight_t = typename Ktraits::weight_t; + using scan_t = typename Ktraits::scan_t; + + // Shared memory. 
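+    // Editor's note (added for clarity, not in the original source): the dynamic
+    // shared memory buffer below is deliberately aliased. The block-wide load /
+    // load-weight / store temp storages all start at offset 0 and are only used
+    // in phases separated by __syncthreads(); the BlockScan temp storage lives
+    // at offset kSmemIOSize, and the per-(row, state) running-prefix slots used
+    // to chain consecutive chunks of the scan live at offset kSmemSize.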
+ extern __shared__ char smem_[]; + // cast to lvalue reference of expected type + // char *smem_loadstorescan = smem_ + 2 * MAX_DSTATE * sizeof(weight_t); + // auto& smem_load = reinterpret_cast(smem_ + 2 * MAX_DSTATE * sizeof(weight_t)); + // auto& smem_load = reinterpret_cast(smem_loadstorescan); + auto& smem_load = reinterpret_cast(smem_); + auto& smem_load_weight = reinterpret_cast(smem_); + auto& smem_load_weight1 = *reinterpret_cast(smem_ + sizeof(typename Ktraits::BlockLoadWeightT::TempStorage)); + auto& smem_store = reinterpret_cast(smem_); + auto& smem_scan = *reinterpret_cast(smem_ + Ktraits::kSmemIOSize); + // weight_t *smem_a = reinterpret_cast(smem_ + smem_loadstorescan_size); + // weight_t *smem_bc = reinterpret_cast(smem_a + MAX_DSTATE); + scan_t *smem_running_prefix = reinterpret_cast(smem_ + Ktraits::kSmemSize); + + const int batch_id = blockIdx.x; + const int dim_id = blockIdx.y; + const int group_id = dim_id / (params.dim_ngroups_ratio); + input_t *u = reinterpret_cast(params.u_ptr) + batch_id * params.u_batch_stride + + dim_id * kNRows * params.u_d_stride; + input_t *delta = reinterpret_cast(params.delta_ptr) + batch_id * params.delta_batch_stride + + dim_id * kNRows * params.delta_d_stride; + weight_t *A = reinterpret_cast(params.A_ptr) + dim_id * kNRows * params.A_d_stride; + weight_t *B = reinterpret_cast(params.B_ptr) + dim_id * kNRows * params.B_d_stride; + input_t *Bvar = reinterpret_cast(params.B_ptr) + batch_id * params.B_batch_stride + group_id * params.B_group_stride; + weight_t *C = reinterpret_cast(params.C_ptr) + dim_id * kNRows * params.C_d_stride; + input_t *Cvar = reinterpret_cast(params.C_ptr) + batch_id * params.C_batch_stride + group_id * params.C_group_stride; + scan_t *x = reinterpret_cast(params.x_ptr) + (batch_id * params.dim + dim_id * kNRows) * params.n_chunks * params.dstate; + + float D_val[kNRows] = {0}; + if (params.D_ptr != nullptr) { + #pragma unroll + for (int r = 0; r < kNRows; ++r) { + D_val[r] = reinterpret_cast(params.D_ptr)[dim_id * kNRows + r]; + } + } + float delta_bias[kNRows] = {0}; + if (params.delta_bias_ptr != nullptr) { + #pragma unroll + for (int r = 0; r < kNRows; ++r) { + delta_bias[r] = reinterpret_cast(params.delta_bias_ptr)[dim_id * kNRows + r]; + } + } + + // for (int state_idx = threadIdx.x; state_idx < params.dstate; state_idx += blockDim.x) { + // smem_a[state_idx] = A[state_idx * params.A_dstate_stride]; + // smem_bc[state_idx] = B[state_idx * params.B_dstate_stride] * C[state_idx * params.C_dstate_stride]; + // } + + constexpr int kChunkSize = kNThreads * kNItems; + for (int chunk = 0; chunk < params.n_chunks; ++chunk) { + input_t u_vals[kNRows][kNItems], delta_vals_load[kNRows][kNItems]; + __syncthreads(); + #pragma unroll + for (int r = 0; r < kNRows; ++r) { + if constexpr (!kDirectIO) { + if (r > 0) { __syncthreads(); } + } + load_input(u + r * params.u_d_stride, u_vals[r], smem_load, params.seqlen - chunk * kChunkSize); + if constexpr (!kDirectIO) { __syncthreads(); } + load_input(delta + r * params.delta_d_stride, delta_vals_load[r], smem_load, params.seqlen - chunk * kChunkSize); + } + u += kChunkSize; + delta += kChunkSize; + + float delta_vals[kNRows][kNItems], delta_u_vals[kNRows][kNItems], out_vals[kNRows][kNItems]; + #pragma unroll + for (int r = 0; r < kNRows; ++r) { + #pragma unroll + for (int i = 0; i < kNItems; ++i) { + float u_val = float(u_vals[r][i]); + delta_vals[r][i] = float(delta_vals_load[r][i]) + delta_bias[r]; + if (params.delta_softplus) { + delta_vals[r][i] = delta_vals[r][i] <= 20.f ? 
log1pf(expf(delta_vals[r][i])) : delta_vals[r][i]; + } + delta_u_vals[r][i] = delta_vals[r][i] * u_val; + out_vals[r][i] = D_val[r] * u_val; + } + } + + __syncthreads(); + for (int state_idx = 0; state_idx < params.dstate; ++state_idx) { + weight_t A_val[kNRows]; + #pragma unroll + for (int r = 0; r < kNRows; ++r) { + A_val[r] = A[state_idx * params.A_dstate_stride + r * params.A_d_stride]; + // Multiply the real part of A with LOG2E so we can use exp2f instead of expf. + constexpr float kLog2e = M_LOG2E; + if constexpr (!kIsComplex) { + A_val[r] *= kLog2e; + } else { + A_val[r].real_ *= kLog2e; + } + } + // This variable holds B * C if both B and C are constant across seqlen. If only B varies + // across seqlen, this holds C. If only C varies across seqlen, this holds B. + // If both B and C vary, this is unused. + weight_t BC_val[kNRows]; + weight_t B_vals[kNItems], C_vals[kNItems]; + if constexpr (kIsVariableB) { + load_weight(Bvar + state_idx * params.B_dstate_stride, B_vals, + smem_load_weight, (params.seqlen - chunk * kChunkSize) * (!kIsComplex ? 1 : 2)); + if constexpr (!kIsVariableC) { + #pragma unroll + for (int r = 0; r < kNRows; ++r) { + BC_val[r] = C[state_idx * params.C_dstate_stride + r * params.C_d_stride]; + } + } + } + if constexpr (kIsVariableC) { + auto &smem_load_weight_C = !kIsVariableB ? smem_load_weight : smem_load_weight1; + load_weight(Cvar + state_idx * params.C_dstate_stride, C_vals, + smem_load_weight_C, (params.seqlen - chunk * kChunkSize) * (!kIsComplex ? 1 : 2)); + if constexpr (!kIsVariableB) { + #pragma unroll + for (int r = 0; r < kNRows; ++r) { + BC_val[r] = B[state_idx * params.B_dstate_stride + r * params.B_d_stride]; + } + } + } + if constexpr (!kIsVariableB && !kIsVariableC) { + #pragma unroll + for (int r = 0; r < kNRows; ++r) { + BC_val[r] = B[state_idx * params.B_dstate_stride + r * params.B_d_stride] * C[state_idx * params.C_dstate_stride + r * params.C_d_stride]; + } + } + + #pragma unroll + for (int r = 0; r < kNRows; ++r) { + if (r > 0) { __syncthreads(); } // Scan could be using the same smem + scan_t thread_data[kNItems]; + #pragma unroll + for (int i = 0; i < kNItems; ++i) { + if constexpr (!kIsComplex) { + thread_data[i] = make_float2(exp2f(delta_vals[r][i] * A_val[r]), + !kIsVariableB ? delta_u_vals[r][i] : B_vals[i] * delta_u_vals[r][i]); + if constexpr (!Ktraits::kIsEvenLen) { // So that the last state is correct + if (threadIdx.x * kNItems + i >= params.seqlen - chunk * kChunkSize) { + thread_data[i] = make_float2(1.f, 0.f); + } + } + } else { + // Pytorch's implementation of complex exp (which calls thrust) is very slow + complex_t delta_a_exp = cexp2f(delta_vals[r][i] * A_val[r]); + weight_t B_delta_u_val = !kIsVariableB ? delta_u_vals[r][i] : B_vals[i] * delta_u_vals[r][i]; + thread_data[i] = make_float4(delta_a_exp.real_, delta_a_exp.imag_, B_delta_u_val.real_, B_delta_u_val.imag_); + if constexpr (!Ktraits::kIsEvenLen) { // So that the last state is correct + if (threadIdx.x * kNItems + i >= params.seqlen - chunk * kChunkSize) { + thread_data[i] = make_float4(1.f, 0.f, 0.f, 0.f); + } + } + } + } + // Initialize running total + scan_t running_prefix; + if constexpr (!kIsComplex) { + // If we use WARP_SCAN then all lane 0 of all warps (not just thread 0) needs to read + running_prefix = chunk > 0 && threadIdx.x % 32 == 0 ? smem_running_prefix[state_idx + r * MAX_DSTATE] : make_float2(1.f, 0.f); + // running_prefix = chunk > 0 && threadIdx.x == 0 ? 
smem_running_prefix[state_idx] : make_float2(1.f, 0.f); + } else { + running_prefix = chunk > 0 && threadIdx.x % 32 == 0 ? smem_running_prefix[state_idx + r * MAX_DSTATE] : make_float4(1.f, 0.f, 0.f, 0.f); + // running_prefix = chunk > 0 && threadIdx.x == 0 ? smem_running_prefix[state_idx] : make_float4(1.f, 0.f, 0.f, 0.f); + } + SSMScanPrefixCallbackOp prefix_op(running_prefix); + Ktraits::BlockScanT(smem_scan).InclusiveScan( + thread_data, thread_data, SSMScanOp(), prefix_op + ); + // There's a syncthreads in the scan op, so we don't need to sync here. + // Unless there's only 1 warp, but then it's the same thread (0) reading and writing. + if (threadIdx.x == 0) { + smem_running_prefix[state_idx] = prefix_op.running_prefix; + x[(r * params.n_chunks + chunk) * params.dstate + state_idx] = prefix_op.running_prefix; + } + #pragma unroll + for (int i = 0; i < kNItems; ++i) { + const weight_t C_val = !kIsVariableC + ? BC_val[r] + : (!kIsVariableB ? BC_val[r] * C_vals[i] : C_vals[i]); + if constexpr (!kIsComplex) { + out_vals[r][i] += thread_data[i].y * C_val; + } else { + out_vals[r][i] += (complex_t(thread_data[i].z, thread_data[i].w) * C_val).real_ * 2; + } + } + } + } + + input_t *out = reinterpret_cast(params.out_ptr) + batch_id * params.out_batch_stride + + dim_id * kNRows * params.out_d_stride + chunk * kChunkSize; + __syncthreads(); + #pragma unroll + for (int r = 0; r < kNRows; ++r) { + if constexpr (!kDirectIO) { + if (r > 0) { __syncthreads(); } + } + store_output(out + r * params.out_d_stride, out_vals[r], smem_store, params.seqlen - chunk * kChunkSize); + } + + if constexpr (kHasZ) { + input_t *z = reinterpret_cast(params.z_ptr) + batch_id * params.z_batch_stride + + dim_id * kNRows * params.z_d_stride + chunk * kChunkSize; + input_t *out_z = reinterpret_cast(params.out_z_ptr) + batch_id * params.out_z_batch_stride + + dim_id * kNRows * params.out_z_d_stride + chunk * kChunkSize; + #pragma unroll + for (int r = 0; r < kNRows; ++r) { + input_t z_vals[kNItems]; + __syncthreads(); + load_input(z + r * params.z_d_stride, z_vals, smem_load, params.seqlen - chunk * kChunkSize); + #pragma unroll + for (int i = 0; i < kNItems; ++i) { + float z_val = z_vals[i]; + out_vals[r][i] *= z_val / (1 + expf(-z_val)); + } + __syncthreads(); + store_output(out_z + r * params.out_z_d_stride, out_vals[r], smem_store, params.seqlen - chunk * kChunkSize); + } + } + + Bvar += kChunkSize * (!kIsComplex ? 1 : 2); + Cvar += kChunkSize * (!kIsComplex ? 1 : 2); + } +} + +template +void selective_scan_fwd_launch(SSMParamsBase ¶ms, cudaStream_t stream) { + // Only kNRows == 1 is tested for now, which ofc doesn't differ from previously when we had each block + // processing 1 row. 
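+    // Editor's note (added for clarity, not in the original source): the nested
+    // BOOL_SWITCH calls below lower four runtime flags (even sequence length,
+    // variable B, variable C, presence of z) into compile-time template
+    // parameters, so each combination compiles to its own specialized kernel.
+    // The grid is (batch, dim / kNRows); the dynamic shared-memory request adds
+    // kNRows * MAX_DSTATE running-prefix slots on top of Ktraits::kSmemSize and
+    // must be enabled via cudaFuncSetAttribute once it exceeds the 48 KB default.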
+ constexpr int kNRows = 1; + BOOL_SWITCH(params.seqlen % (kNThreads * kNItems) == 0, kIsEvenLen, [&] { + BOOL_SWITCH(params.is_variable_B, kIsVariableB, [&] { + BOOL_SWITCH(params.is_variable_C, kIsVariableC, [&] { + BOOL_SWITCH(params.z_ptr != nullptr , kHasZ, [&] { + using Ktraits = Selective_Scan_fwd_kernel_traits; + // constexpr int kSmemSize = Ktraits::kSmemSize; + constexpr int kSmemSize = Ktraits::kSmemSize + kNRows * MAX_DSTATE * sizeof(typename Ktraits::scan_t); + // printf("smem_size = %d\n", kSmemSize); + dim3 grid(params.batch, params.dim / kNRows); + auto kernel = &selective_scan_fwd_kernel; + if (kSmemSize >= 48 * 1024) { + C10_CUDA_CHECK(cudaFuncSetAttribute( + kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize)); + } + kernel<<>>(params); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); + }); + }); + }); +} + +template +void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream) { + if (params.seqlen <= 128) { + selective_scan_fwd_launch<32, 4, input_t, weight_t>(params, stream); + } else if (params.seqlen <= 256) { + selective_scan_fwd_launch<32, 8, input_t, weight_t>(params, stream); + } else if (params.seqlen <= 512) { + selective_scan_fwd_launch<32, 16, input_t, weight_t>(params, stream); + } else if (params.seqlen <= 1024) { + selective_scan_fwd_launch<64, 16, input_t, weight_t>(params, stream); + } else { + selective_scan_fwd_launch<128, 16, input_t, weight_t>(params, stream); + } +} diff --git a/mamba/csrc/selective_scan/static_switch.h b/mamba/csrc/selective_scan/static_switch.h new file mode 100644 index 0000000000000000000000000000000000000000..7920ac045d0a2a1f4c4159ee3eebe51fe1e2c203 --- /dev/null +++ b/mamba/csrc/selective_scan/static_switch.h @@ -0,0 +1,25 @@ +// Inspired by https://github.com/NVIDIA/DALI/blob/main/include/dali/core/static_switch.h +// and https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Dispatch.h + +#pragma once + +/// @param COND - a boolean expression to switch by +/// @param CONST_NAME - a name given for the constexpr bool variable. +/// @param ... - code to execute for true and false +/// +/// Usage: +/// ``` +/// BOOL_SWITCH(flag, BoolConst, [&] { +/// some_function(...); +/// }); +/// ``` +#define BOOL_SWITCH(COND, CONST_NAME, ...) \ + [&] { \ + if (COND) { \ + constexpr bool CONST_NAME = true; \ + return __VA_ARGS__(); \ + } else { \ + constexpr bool CONST_NAME = false; \ + return __VA_ARGS__(); \ + } \ + }() diff --git a/mamba/csrc/selective_scan/uninitialized_copy.cuh b/mamba/csrc/selective_scan/uninitialized_copy.cuh new file mode 100644 index 0000000000000000000000000000000000000000..630622dddcc9041737307810000584a843a01764 --- /dev/null +++ b/mamba/csrc/selective_scan/uninitialized_copy.cuh @@ -0,0 +1,69 @@ +/****************************************************************************** + * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#pragma once + +#include + +#include + + +namespace detail +{ + +#if defined(_NVHPC_CUDA) +template +__host__ __device__ void uninitialized_copy(T *ptr, U &&val) +{ + // NVBug 3384810 + new (ptr) T(::cuda::std::forward(val)); +} +#else +template ::value, + int + >::type = 0> +__host__ __device__ void uninitialized_copy(T *ptr, U &&val) +{ + *ptr = ::cuda::std::forward(val); +} + +template ::value, + int + >::type = 0> +__host__ __device__ void uninitialized_copy(T *ptr, U &&val) +{ + new (ptr) T(::cuda::std::forward(val)); +} +#endif + +} // namespace detail diff --git a/mamba/evals/lm_harness_eval.py b/mamba/evals/lm_harness_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..d09d40534cf53be4d1387666697c82aa53add625 --- /dev/null +++ b/mamba/evals/lm_harness_eval.py @@ -0,0 +1,39 @@ +import torch + +import transformers +from transformers import AutoTokenizer + +from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel + +from lm_eval.api.model import LM +from lm_eval.models.huggingface import HFLM +from lm_eval.api.registry import register_model +from lm_eval.__main__ import cli_evaluate + + +@register_model("mamba") +class MambaEvalWrapper(HFLM): + + AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM + + def __init__(self, pretrained="state-spaces/mamba-2.8b", max_length=2048, batch_size=None, device="cuda", + dtype=torch.float16): + LM.__init__(self) + self._model = MambaLMHeadModel.from_pretrained(pretrained, device=device, dtype=dtype) + self.tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b") + self.tokenizer.pad_token_id = self.tokenizer.eos_token_id + self.vocab_size = self.tokenizer.vocab_size + self._batch_size = batch_size if batch_size is None else 64 + self._max_length = max_length + self._device = torch.device(device) + + @property + def batch_size(self): + return self._batch_size + + def _model_generate(self, context, max_length, stop, **generation_kwargs): + raise NotImplementedError() + + +if __name__ == "__main__": + cli_evaluate() diff --git a/mamba/mamba_ssm/__init__.py b/mamba/mamba_ssm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2ecd144db5dbec72bcfcdcea28c624a7e2bf053b --- /dev/null +++ b/mamba/mamba_ssm/__init__.py @@ -0,0 +1,5 @@ +__version__ = "1.0.1" + +from mamba_ssm.ops.selective_scan_interface import selective_scan_fn, mamba_inner_fn, bimamba_inner_fn +from mamba_ssm.modules.mamba_simple import Mamba +from 
mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel diff --git a/mamba/mamba_ssm/models/__init__.py b/mamba/mamba_ssm/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/mamba/mamba_ssm/models/mixer_seq_simple.py b/mamba/mamba_ssm/models/mixer_seq_simple.py new file mode 100644 index 0000000000000000000000000000000000000000..383f773f1f700cd53176e51327a5d8dc58158da0 --- /dev/null +++ b/mamba/mamba_ssm/models/mixer_seq_simple.py @@ -0,0 +1,233 @@ +# Copyright (c) 2023, Albert Gu, Tri Dao. + +import math +from functools import partial + +from collections import namedtuple + +import torch +import torch.nn as nn + +from mamba_ssm.modules.mamba_simple import Mamba, Block +from mamba_ssm.utils.generation import GenerationMixin +from mamba_ssm.utils.hf import load_config_hf, load_state_dict_hf + +try: + from mamba_ssm.ops.triton.layernorm import RMSNorm, layer_norm_fn, rms_norm_fn +except ImportError: + RMSNorm, layer_norm_fn, rms_norm_fn = None, None, None + + +def create_block( + d_model, + ssm_cfg=None, + norm_epsilon=1e-5, + rms_norm=False, + residual_in_fp32=False, + fused_add_norm=False, + layer_idx=None, + device=None, + dtype=None, +): + if ssm_cfg is None: + ssm_cfg = {} + factory_kwargs = {"device": device, "dtype": dtype} + mixer_cls = partial(Mamba, layer_idx=layer_idx, **ssm_cfg, **factory_kwargs) + norm_cls = partial( + nn.LayerNorm if not rms_norm else RMSNorm, eps=norm_epsilon, **factory_kwargs + ) + block = Block( + d_model, + mixer_cls, + norm_cls=norm_cls, + fused_add_norm=fused_add_norm, + residual_in_fp32=residual_in_fp32, + ) + block.layer_idx = layer_idx + return block + + +# https://github.com/huggingface/transformers/blob/c28d04e9e252a1a099944e325685f14d242ecdcd/src/transformers/models/gpt2/modeling_gpt2.py#L454 +def _init_weights( + module, + n_layer, + initializer_range=0.02, # Now only used for embedding layer. + rescale_prenorm_residual=True, + n_residuals_per_layer=1, # Change to 2 if we have MLP +): + if isinstance(module, nn.Linear): + if module.bias is not None: + if not getattr(module.bias, "_no_reinit", False): + nn.init.zeros_(module.bias) + elif isinstance(module, nn.Embedding): + nn.init.normal_(module.weight, std=initializer_range) + + if rescale_prenorm_residual: + # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme: + # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale + # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers. 
+ # > -- GPT-2 :: https://openai.com/blog/better-language-models/ + # + # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py + for name, p in module.named_parameters(): + if name in ["out_proj.weight", "fc2.weight"]: + # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block + # Following Pytorch init, except scale by 1/sqrt(2 * n_layer) + # We need to reinit p since this code could be called multiple times + # Having just p *= scale would repeatedly scale it down + nn.init.kaiming_uniform_(p, a=math.sqrt(5)) + with torch.no_grad(): + p /= math.sqrt(n_residuals_per_layer * n_layer) + + +class MixerModel(nn.Module): + def __init__( + self, + d_model: int, + n_layer: int, + vocab_size: int, + ssm_cfg=None, + norm_epsilon: float = 1e-5, + rms_norm: bool = False, + initializer_cfg=None, + fused_add_norm=False, + residual_in_fp32=False, + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.residual_in_fp32 = residual_in_fp32 + + self.embedding = nn.Embedding(vocab_size, d_model, **factory_kwargs) + + # We change the order of residual and layer norm: + # Instead of LN -> Attn / MLP -> Add, we do: + # Add -> LN -> Attn / MLP / Mixer, returning both the residual branch (output of Add) and + # the main branch (output of MLP / Mixer). The model definition is unchanged. + # This is for performance reason: we can fuse add + layer_norm. + self.fused_add_norm = fused_add_norm + if self.fused_add_norm: + if layer_norm_fn is None or rms_norm_fn is None: + raise ImportError("Failed to import Triton LayerNorm / RMSNorm kernels") + + self.layers = nn.ModuleList( + [ + create_block( + d_model, + ssm_cfg=ssm_cfg, + norm_epsilon=norm_epsilon, + rms_norm=rms_norm, + residual_in_fp32=residual_in_fp32, + fused_add_norm=fused_add_norm, + layer_idx=i, + **factory_kwargs, + ) + for i in range(n_layer) + ] + ) + + self.norm_f = (nn.LayerNorm if not rms_norm else RMSNorm)( + d_model, eps=norm_epsilon, **factory_kwargs + ) + + self.apply( + partial( + _init_weights, + n_layer=n_layer, + **(initializer_cfg if initializer_cfg is not None else {}), + ) + ) + + def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs): + return { + i: layer.allocate_inference_cache(batch_size, max_seqlen, dtype=dtype, **kwargs) + for i, layer in enumerate(self.layers) + } + + def forward(self, input_ids, inference_params=None): + hidden_states = self.embedding(input_ids) + residual = None + for layer in self.layers: + hidden_states, residual = layer( + hidden_states, residual, inference_params=inference_params + ) + if not self.fused_add_norm: + residual = (hidden_states + residual) if residual is not None else hidden_states + hidden_states = self.norm_f(residual.to(dtype=self.norm_f.weight.dtype)) + else: + # Set prenorm=False here since we don't need the residual + fused_add_norm_fn = rms_norm_fn if isinstance(self.norm_f, RMSNorm) else layer_norm_fn + hidden_states = fused_add_norm_fn( + hidden_states, + self.norm_f.weight, + self.norm_f.bias, + eps=self.norm_f.eps, + residual=residual, + prenorm=False, + residual_in_fp32=self.residual_in_fp32, + ) + return hidden_states + + +class MambaLMHeadModel(nn.Module, GenerationMixin): + + def __init__( + self, + d_model: int, + n_layer: int, + vocab_size: int, + initializer_cfg=None, + pad_vocab_size_multiple: int = 1, + device=None, + dtype=None, + **backbone_kwargs, + ) -> None: + factory_kwargs = {"device": device, "dtype": 
dtype} + super().__init__() + if vocab_size % pad_vocab_size_multiple != 0: + vocab_size += pad_vocab_size_multiple - (vocab_size % pad_vocab_size_multiple) + self.backbone = MixerModel( + d_model=d_model, + n_layer=n_layer, + vocab_size=vocab_size, + initializer_cfg=initializer_cfg, + **backbone_kwargs, + **factory_kwargs, + ) + self.lm_head = nn.Linear(d_model, vocab_size, bias=False, **factory_kwargs) + + # Initialize weights and apply final processing + self.apply( + partial( + _init_weights, + n_layer=n_layer, + **(initializer_cfg if initializer_cfg is not None else {}), + ) + ) + self.tie_weights() + + def tie_weights(self): + self.lm_head.weight = self.backbone.embedding.weight + + def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs): + return self.backbone.allocate_inference_cache(batch_size, max_seqlen, dtype=dtype, **kwargs) + + def forward(self, input_ids, position_ids=None, inference_params=None, num_last_tokens=0): + """ + "position_ids" is just to be compatible with Transformer generation. We don't use it. + num_last_tokens: if > 0, only return the logits for the last n tokens + """ + hidden_states = self.backbone(input_ids, inference_params=inference_params) + if num_last_tokens > 0: + hidden_states = hidden_states[:, -num_last_tokens:] + lm_logits = self.lm_head(hidden_states) + CausalLMOutput = namedtuple("CausalLMOutput", ["logits"]) + return CausalLMOutput(logits=lm_logits) + + @classmethod + def from_pretrained(cls, pretrained_model_name, device=None, dtype=None, **kwargs): + config = load_config_hf(pretrained_model_name) + model = cls(**config, device=device, dtype=dtype, **kwargs) + model.load_state_dict(load_state_dict_hf(pretrained_model_name, device=device, dtype=dtype)) + return model diff --git a/mamba/mamba_ssm/modules/__init__.py b/mamba/mamba_ssm/modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/mamba/mamba_ssm/modules/mamba_simple.py b/mamba/mamba_ssm/modules/mamba_simple.py new file mode 100644 index 0000000000000000000000000000000000000000..2a1dd8f808b50d632bbd22f0648d4cb8939cb1e1 --- /dev/null +++ b/mamba/mamba_ssm/modules/mamba_simple.py @@ -0,0 +1,418 @@ +# Copyright (c) 2023, Tri Dao, Albert Gu. 
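+# Editor's sketch (illustrative only, not part of the original file; the values
+# are arbitrary): a single Mamba block defined below maps a
+# (batch, seqlen, d_model) tensor to the same shape, e.g.
+#
+#     block = Mamba(d_model=256, d_state=16, d_conv=4, expand=2).to("cuda")
+#     y = block(torch.randn(2, 64, 256, device="cuda"))   # -> (2, 64, 256)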
+ +import math +from typing import Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor + +from einops import rearrange, repeat + +try: + from causal_conv1d import causal_conv1d_fn, causal_conv1d_update +except ImportError: + causal_conv1d_fn, causal_conv1d_update = None + +try: + from mamba_ssm.ops.selective_scan_interface import selective_scan_fn, mamba_inner_fn, bimamba_inner_fn, mamba_inner_fn_no_out_proj +except ImportError: + selective_scan_fn, mamba_inner_fn, bimamba_inner_fn, mamba_inner_fn_no_out_proj = None, None, None, None, None + +try: + from mamba_ssm.ops.triton.selective_state_update import selective_state_update +except ImportError: + selective_state_update = None + +try: + from mamba_ssm.ops.triton.layernorm import RMSNorm, layer_norm_fn, rms_norm_fn +except ImportError: + RMSNorm, layer_norm_fn, rms_norm_fn = None, None, None + + +class Mamba(nn.Module): + def __init__( + self, + d_model, + d_state=16, + d_conv=4, + expand=2, + dt_rank="auto", + dt_min=0.001, + dt_max=0.1, + dt_init="random", + dt_scale=1.0, + dt_init_floor=1e-4, + conv_bias=True, + bias=False, + use_fast_path=True, # Fused kernel options + layer_idx=None, + device=None, + dtype=None, + bimamba=True, + ): + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.d_model = d_model + self.d_state = d_state + self.d_conv = d_conv + self.expand = expand + self.d_inner = int(self.expand * self.d_model) + self.dt_rank = math.ceil(self.d_model / 16) if dt_rank == "auto" else dt_rank + self.use_fast_path = use_fast_path + self.layer_idx = layer_idx + self.bimamba = bimamba + + self.in_proj = nn.Linear(self.d_model, self.d_inner * 2, bias=bias, **factory_kwargs) + + self.conv1d = nn.Conv1d( + in_channels=self.d_inner, + out_channels=self.d_inner, + bias=conv_bias, + kernel_size=d_conv, + groups=self.d_inner, + padding=d_conv - 1, + **factory_kwargs, + ) + + self.activation = "silu" + self.act = nn.SiLU() + + self.x_proj = nn.Linear( + self.d_inner, self.dt_rank + self.d_state * 2, bias=False, **factory_kwargs + ) + self.dt_proj = nn.Linear(self.dt_rank, self.d_inner, bias=True, **factory_kwargs) + + # Initialize special dt projection to preserve variance at initialization + dt_init_std = self.dt_rank**-0.5 * dt_scale + if dt_init == "constant": + nn.init.constant_(self.dt_proj.weight, dt_init_std) + elif dt_init == "random": + nn.init.uniform_(self.dt_proj.weight, -dt_init_std, dt_init_std) + else: + raise NotImplementedError + + # Initialize dt bias so that F.softplus(dt_bias) is between dt_min and dt_max + dt = torch.exp( + torch.rand(self.d_inner, **factory_kwargs) * (math.log(dt_max) - math.log(dt_min)) + + math.log(dt_min) + ).clamp(min=dt_init_floor) + # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759 + inv_dt = dt + torch.log(-torch.expm1(-dt)) + with torch.no_grad(): + self.dt_proj.bias.copy_(inv_dt) + # Our initialization would set all Linear.bias to zero, need to mark this one as _no_reinit + self.dt_proj.bias._no_reinit = True + + # S4D real initialization + # NOTE: why plus 1? 
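+        # Editor's note (added for clarity): this is the S4D-Real initialization.
+        # Starting the arange at 1 keeps torch.log(A) finite, and after the sign
+        # flip in forward() (A = -torch.exp(self.A_log)) every channel gets real
+        # eigenvalues -1, -2, ..., -d_state, i.e. a spread of decay rates for the
+        # recurrent state.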
+ A = repeat( + torch.arange(1, self.d_state + 1, dtype=torch.float32, device=device), + "n -> d n", + d=self.d_inner, + ).contiguous() + A_log = torch.log(A) # Keep A_log in fp32 + self.A_log = nn.Parameter(A_log) + self.A_log._no_weight_decay = True + + # D "skip" parameter + self.D = nn.Parameter(torch.ones(self.d_inner, device=device)) # Keep in fp32 + self.D._no_weight_decay = True + + # bidirectional + # forked from https://github.com/hustvl/Vim + if self.bimamba: + A_b = repeat( + torch.arange(1, self.d_state + 1, dtype=torch.float32, device=device), + "n -> d n", + d=self.d_inner, + ).contiguous() + A_b_log = torch.log(A_b) # Keep A_b_log in fp32 + self.A_b_log = nn.Parameter(A_b_log) + self.A_b_log._no_weight_decay = True + + self.conv1d_b = nn.Conv1d( + in_channels=self.d_inner, + out_channels=self.d_inner, + bias=conv_bias, + kernel_size=d_conv, + groups=self.d_inner, + padding=d_conv - 1, + **factory_kwargs, + ) + + self.x_proj_b = nn.Linear( + self.d_inner, self.dt_rank + self.d_state * 2, bias=False, **factory_kwargs + ) + self.dt_proj_b = nn.Linear(self.dt_rank, self.d_inner, bias=True, **factory_kwargs) + + self.D_b = nn.Parameter(torch.ones(self.d_inner, device=device)) # Keep in fp32 + self.D_b._no_weight_decay = True + + self.out_proj = nn.Linear(self.d_inner, self.d_model, bias=bias, **factory_kwargs) + + def forward(self, hidden_states, inference_params=None, T=1): + """ + hidden_states: (B, L, D) + Returns: same shape as hidden_states + """ + batch, seqlen, dim = hidden_states.shape + + conv_state, ssm_state = None, None + if inference_params is not None: + conv_state, ssm_state = self._get_states_from_cache(inference_params, batch) + if inference_params.seqlen_offset > 0: + # The states are updated inplace + out, _, _ = self.step(hidden_states, conv_state, ssm_state) + return out + + # We do matmul and transpose BLH -> HBL at the same time + # NOTE: same as in_proj(hidden_states) but memory-efficient with the following operations + xz = rearrange( + self.in_proj.weight @ rearrange(hidden_states, "b l d -> d (b l)"), + "d (b l) -> b d l", + l=seqlen, + ) + if self.in_proj.bias is not None: + xz = xz + rearrange(self.in_proj.bias.to(dtype=xz.dtype), "d -> d 1") + + A = -torch.exp(self.A_log.float()) # (d_inner, d_state) + # In the backward pass we write dx and dz next to each other to avoid torch.cat + if self.use_fast_path and inference_params is None: # Doesn't support outputting the states + if self.bimamba: + A_b = -torch.exp(self.A_b_log.float()) + out = mamba_inner_fn_no_out_proj( + xz, + self.conv1d.weight, + self.conv1d.bias, + self.x_proj.weight, + self.dt_proj.weight, + A, + None, # input-dependent B + None, # input-dependent C + self.D.float(), + delta_bias=self.dt_proj.bias.float(), + delta_softplus=True, + ) + out_b = mamba_inner_fn_no_out_proj( + xz.flip([-1]), + self.conv1d_b.weight, + self.conv1d_b.bias, + self.x_proj_b.weight, + self.dt_proj_b.weight, + A_b, + None, + None, + self.D_b.float(), + delta_bias=self.dt_proj_b.bias.float(), + delta_softplus=True, + ) + out = F.linear(rearrange(out + out_b.flip([-1]), "b d l -> b l d"), self.out_proj.weight, self.out_proj.bias) + else: + out = mamba_inner_fn( + xz, + self.conv1d.weight, + self.conv1d.bias, + self.x_proj.weight, + self.dt_proj.weight, + self.out_proj.weight, + self.out_proj.bias, + A, + None, # input-dependent B + None, # input-dependent C + self.D.float(), + delta_bias=self.dt_proj.bias.float(), + delta_softplus=True, + ) + else: + x, z = xz.chunk(2, dim=1) + # Compute short convolution + if 
conv_state is not None: + conv_state.copy_(x[:, :, -self.d_conv :]) # Update state (B D W) + if causal_conv1d_fn is None: + x = self.act(self.conv1d(x)[..., :seqlen]) + else: + assert self.activation in ["silu", "swish"] + x = causal_conv1d_fn( + x, + rearrange(self.conv1d.weight, "d 1 w -> d w"), + self.conv1d.bias, + self.activation, + ) + + # We're careful here about the layout, to avoid extra transposes. + # We want dt to have d as the slowest moving dimension + # and L as the fastest moving dimension, since those are what the ssm_scan kernel expects. + x_dbl = self.x_proj(rearrange(x, "b d l -> (b l) d")) # (bl d) + dt, B, C = torch.split(x_dbl, [self.dt_rank, self.d_state, self.d_state], dim=-1) + dt = self.dt_proj.weight @ dt.t() + dt = rearrange(dt, "d (b l) -> b d l", l=seqlen) + B = rearrange(B, "(b l) dstate -> b dstate l", l=seqlen).contiguous() + C = rearrange(C, "(b l) dstate -> b dstate l", l=seqlen).contiguous() + assert self.activation in ["silu", "swish"] + y = selective_scan_fn( + x, + dt, + A, + B, + C, + self.D.float(), + z=z, + delta_bias=self.dt_proj.bias.float(), + delta_softplus=True, + return_last_state=ssm_state is not None, + ) + if ssm_state is not None: + y, last_state = y + ssm_state.copy_(last_state) + y = rearrange(y, "b d l -> b l d") + out = self.out_proj(y) + return out + + def step(self, hidden_states, conv_state, ssm_state): + dtype = hidden_states.dtype + assert hidden_states.shape[1] == 1, "Only support decoding with 1 token at a time for now" + xz = self.in_proj(hidden_states.squeeze(1)) # (B 2D) + x, z = xz.chunk(2, dim=-1) # (B D) + + # Conv step + if causal_conv1d_update is None: + conv_state.copy_(torch.roll(conv_state, shifts=-1, dims=-1)) # Update state (B D W) + conv_state[:, :, -1] = x + x = torch.sum(conv_state * rearrange(self.conv1d.weight, "d 1 w -> d w"), dim=-1) # (B D) + if self.conv1d.bias is not None: + x = x + self.conv1d.bias + x = self.act(x).to(dtype=dtype) + else: + x = causal_conv1d_update( + x, + conv_state, + rearrange(self.conv1d.weight, "d 1 w -> d w"), + self.conv1d.bias, + self.activation, + ) + + x_db = self.x_proj(x) # (B dt_rank+2*d_state) + dt, B, C = torch.split(x_db, [self.dt_rank, self.d_state, self.d_state], dim=-1) + # Don't add dt_bias here + dt = F.linear(dt, self.dt_proj.weight) # (B d_inner) + A = -torch.exp(self.A_log.float()) # (d_inner, d_state) + + # SSM step + if selective_state_update is None: + # Discretize A and B + dt = F.softplus(dt + self.dt_proj.bias.to(dtype=dt.dtype)) + dA = torch.exp(torch.einsum("bd,dn->bdn", dt, A)) + dB = torch.einsum("bd,bn->bdn", dt, B) + ssm_state.copy_(ssm_state * dA + rearrange(x, "b d -> b d 1") * dB) + y = torch.einsum("bdn,bn->bd", ssm_state.to(dtype), C) + y = y + self.D.to(dtype) * x + y = y * self.act(z) # (B D) + else: + y = selective_state_update( + ssm_state, x, dt, A, B, C, self.D, z=z, dt_bias=self.dt_proj.bias, dt_softplus=True + ) + + out = self.out_proj(y) + return out.unsqueeze(1), conv_state, ssm_state + + def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs): + device = self.out_proj.weight.device + conv_dtype = self.conv1d.weight.dtype if dtype is None else dtype + conv_state = torch.zeros( + batch_size, self.d_model * self.expand, self.d_conv, device=device, dtype=conv_dtype + ) + ssm_dtype = self.dt_proj.weight.dtype if dtype is None else dtype + # ssm_dtype = torch.float32 + ssm_state = torch.zeros( + batch_size, self.d_model * self.expand, self.d_state, device=device, dtype=ssm_dtype + ) + return conv_state, ssm_state + 
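+    # Editor's note (added for clarity): during autoregressive decoding the two
+    # tensors allocated above are updated in place by step(): conv_state
+    # (batch, d_inner, d_conv) is the rolling window feeding the causal conv1d,
+    # and ssm_state (batch, d_inner, d_state) is the recurrent SSM state.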
+ def _get_states_from_cache(self, inference_params, batch_size, initialize_states=False): + assert self.layer_idx is not None + if self.layer_idx not in inference_params.key_value_memory_dict: + batch_shape = (batch_size,) + conv_state = torch.zeros( + batch_size, + self.d_model * self.expand, + self.d_conv, + device=self.conv1d.weight.device, + dtype=self.conv1d.weight.dtype, + ) + ssm_state = torch.zeros( + batch_size, + self.d_model * self.expand, + self.d_state, + device=self.dt_proj.weight.device, + dtype=self.dt_proj.weight.dtype, + # dtype=torch.float32, + ) + inference_params.key_value_memory_dict[self.layer_idx] = (conv_state, ssm_state) + else: + conv_state, ssm_state = inference_params.key_value_memory_dict[self.layer_idx] + # TODO: What if batch size changes between generation, and we reuse the same states? + if initialize_states: + conv_state.zero_() + ssm_state.zero_() + return conv_state, ssm_state + + +class Block(nn.Module): + def __init__( + self, dim, mixer_cls, norm_cls=nn.LayerNorm, fused_add_norm=False, residual_in_fp32=False + ): + """ + Simple block wrapping a mixer class with LayerNorm/RMSNorm and residual connection" + + This Block has a slightly different structure compared to a regular + prenorm Transformer block. + The standard block is: LN -> MHA/MLP -> Add. + [Ref: https://arxiv.org/abs/2002.04745] + Here we have: Add -> LN -> Mixer, returning both + the hidden_states (output of the mixer) and the residual. + This is purely for performance reasons, as we can fuse add and LayerNorm. + The residual needs to be provided (except for the very first block). + """ + super().__init__() + self.residual_in_fp32 = residual_in_fp32 + self.fused_add_norm = fused_add_norm + self.mixer = mixer_cls(dim) + self.norm = norm_cls(dim) + if self.fused_add_norm: + assert RMSNorm is not None, "RMSNorm import fails" + assert isinstance( + self.norm, (nn.LayerNorm, RMSNorm) + ), "Only LayerNorm and RMSNorm are supported for fused_add_norm" + + def forward( + self, hidden_states: Tensor, residual: Optional[Tensor] = None, inference_params=None + ): + r"""Pass the input through the encoder layer. + + Args: + hidden_states: the sequence to the encoder layer (required). 
+ residual: hidden_states = Mixer(LN(residual)) + """ + if not self.fused_add_norm: + residual = (hidden_states + residual) if residual is not None else hidden_states + hidden_states = self.norm(residual.to(dtype=self.norm.weight.dtype)) + if self.residual_in_fp32: + residual = residual.to(torch.float32) + else: + fused_add_norm_fn = rms_norm_fn if isinstance(self.norm, RMSNorm) else layer_norm_fn + hidden_states, residual = fused_add_norm_fn( + hidden_states, + self.norm.weight, + self.norm.bias, + residual=residual, + prenorm=True, + residual_in_fp32=self.residual_in_fp32, + eps=self.norm.eps, + ) + hidden_states = self.mixer(hidden_states, inference_params=inference_params) + return hidden_states, residual + + def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs): + return self.mixer.allocate_inference_cache(batch_size, max_seqlen, dtype=dtype, **kwargs) diff --git a/mamba/mamba_ssm/ops/__init__.py b/mamba/mamba_ssm/ops/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/mamba/mamba_ssm/ops/selective_scan_interface.py b/mamba/mamba_ssm/ops/selective_scan_interface.py new file mode 100644 index 0000000000000000000000000000000000000000..99b455ed949c123bb453922d5ac88d00f401e392 --- /dev/null +++ b/mamba/mamba_ssm/ops/selective_scan_interface.py @@ -0,0 +1,709 @@ +# Copyright (c) 2023, Tri Dao, Albert Gu. + +import torch +import torch.nn.functional as F +from torch.cuda.amp import custom_bwd, custom_fwd + +from einops import rearrange, repeat + +from causal_conv1d import causal_conv1d_fn +import causal_conv1d_cuda +import selective_scan_cuda + + +class SelectiveScanFn(torch.autograd.Function): + + @staticmethod + def forward(ctx, u, delta, A, B, C, D=None, z=None, delta_bias=None, delta_softplus=False, + return_last_state=False): + if u.stride(-1) != 1: + u = u.contiguous() + if delta.stride(-1) != 1: + delta = delta.contiguous() + if D is not None: + D = D.contiguous() + if B.stride(-1) != 1: + B = B.contiguous() + if C.stride(-1) != 1: + C = C.contiguous() + if z is not None and z.stride(-1) != 1: + z = z.contiguous() + if B.dim() == 3: + B = rearrange(B, "b dstate l -> b 1 dstate l") + ctx.squeeze_B = True + if C.dim() == 3: + C = rearrange(C, "b dstate l -> b 1 dstate l") + ctx.squeeze_C = True + out, x, *rest = selective_scan_cuda.fwd(u, delta, A, B, C, D, z, delta_bias, delta_softplus) + ctx.delta_softplus = delta_softplus + ctx.has_z = z is not None + last_state = x[:, :, -1, 1::2] # (batch, dim, dstate) + if not ctx.has_z: + ctx.save_for_backward(u, delta, A, B, C, D, delta_bias, x) + return out if not return_last_state else (out, last_state) + else: + ctx.save_for_backward(u, delta, A, B, C, D, z, delta_bias, x, out) + out_z = rest[0] + return out_z if not return_last_state else (out_z, last_state) + + @staticmethod + def backward(ctx, dout, *args): + if not ctx.has_z: + u, delta, A, B, C, D, delta_bias, x = ctx.saved_tensors + z = None + out = None + else: + u, delta, A, B, C, D, z, delta_bias, x, out = ctx.saved_tensors + if dout.stride(-1) != 1: + dout = dout.contiguous() + # The kernel supports passing in a pre-allocated dz (e.g., in case we want to fuse the + # backward of selective_scan_cuda with the backward of chunk). + # Here we just pass in None and dz will be allocated in the C++ code. 
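+        # Editor's note (added for clarity): backward has to return one gradient
+        # per forward argument (u, delta, A, B, C, D, z, delta_bias,
+        # delta_softplus, return_last_state); the two trailing Nones in the
+        # return below stand in for the non-tensor flags, and dz is only
+        # produced when z was passed in.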
+ du, ddelta, dA, dB, dC, dD, ddelta_bias, *rest = selective_scan_cuda.bwd( + u, delta, A, B, C, D, z, delta_bias, dout, x, out, None, ctx.delta_softplus, + False # option to recompute out_z, not used here + ) + dz = rest[0] if ctx.has_z else None + dB = dB.squeeze(1) if getattr(ctx, "squeeze_B", False) else dB + dC = dC.squeeze(1) if getattr(ctx, "squeeze_C", False) else dC + return (du, ddelta, dA, dB, dC, + dD if D is not None else None, + dz, + ddelta_bias if delta_bias is not None else None, + None, + None) + + +def selective_scan_fn(u, delta, A, B, C, D=None, z=None, delta_bias=None, delta_softplus=False, + return_last_state=False): + """if return_last_state is True, returns (out, last_state) + last_state has shape (batch, dim, dstate). Note that the gradient of the last state is + not considered in the backward pass. + """ + return SelectiveScanFn.apply(u, delta, A, B, C, D, z, delta_bias, delta_softplus, return_last_state) + + +def selective_scan_ref(u, delta, A, B, C, D=None, z=None, delta_bias=None, delta_softplus=False, + return_last_state=False): + """ + u: r(B D L) + delta: r(B D L) + A: c(D N) or r(D N) + B: c(D N) or r(B N L) or r(B N 2L) or r(B G N L) or (B G N L) + C: c(D N) or r(B N L) or r(B N 2L) or r(B G N L) or (B G N L) + D: r(D) + z: r(B D L) + delta_bias: r(D), fp32 + + out: r(B D L) + last_state (optional): r(B D dstate) or c(B D dstate) + """ + dtype_in = u.dtype + u = u.float() + delta = delta.float() + if delta_bias is not None: + delta = delta + delta_bias[..., None].float() + if delta_softplus: + delta = F.softplus(delta) + batch, dim, dstate = u.shape[0], A.shape[0], A.shape[1] + is_variable_B = B.dim() >= 3 + is_variable_C = C.dim() >= 3 + if A.is_complex(): + if is_variable_B: + B = torch.view_as_complex(rearrange(B.float(), "... (L two) -> ... L two", two=2)) + if is_variable_C: + C = torch.view_as_complex(rearrange(C.float(), "... (L two) -> ... 
L two", two=2)) + else: + B = B.float() + C = C.float() + x = A.new_zeros((batch, dim, dstate)) + ys = [] + deltaA = torch.exp(torch.einsum('bdl,dn->bdln', delta, A)) + if not is_variable_B: + deltaB_u = torch.einsum('bdl,dn,bdl->bdln', delta, B, u) + else: + if B.dim() == 3: + deltaB_u = torch.einsum('bdl,bnl,bdl->bdln', delta, B, u) + else: + B = repeat(B, "B G N L -> B (G H) N L", H=dim // B.shape[1]) + deltaB_u = torch.einsum('bdl,bdnl,bdl->bdln', delta, B, u) + if is_variable_C and C.dim() == 4: + C = repeat(C, "B G N L -> B (G H) N L", H=dim // C.shape[1]) + last_state = None + for i in range(u.shape[2]): + x = deltaA[:, :, i] * x + deltaB_u[:, :, i] + if not is_variable_C: + y = torch.einsum('bdn,dn->bd', x, C) + else: + if C.dim() == 3: + y = torch.einsum('bdn,bn->bd', x, C[:, :, i]) + else: + y = torch.einsum('bdn,bdn->bd', x, C[:, :, :, i]) + if i == u.shape[2] - 1: + last_state = x + if y.is_complex(): + y = y.real * 2 + ys.append(y) + y = torch.stack(ys, dim=2) # (batch dim L) + out = y if D is None else y + u * rearrange(D, "d -> d 1") + if z is not None: + out = out * F.silu(z) + out = out.to(dtype=dtype_in) + return out if not return_last_state else (out, last_state) + + +class MambaInnerFnNoOutProj(torch.autograd.Function): + + @staticmethod + @custom_fwd + def forward(ctx, xz, conv1d_weight, conv1d_bias, x_proj_weight, delta_proj_weight, + A, B=None, C=None, D=None, delta_bias=None, B_proj_bias=None, + C_proj_bias=None, delta_softplus=True, checkpoint_lvl=1): + """ + xz: (batch, dim, seqlen) + """ + assert checkpoint_lvl in [0, 1] + L = xz.shape[-1] + delta_rank = delta_proj_weight.shape[1] + d_state = A.shape[-1] * (1 if not A.is_complex() else 2) + if torch.is_autocast_enabled(): + x_proj_weight = x_proj_weight.to(dtype=torch.get_autocast_gpu_dtype()) + delta_proj_weight = delta_proj_weight.to(dtype=torch.get_autocast_gpu_dtype()) + if xz.stride(-1) != 1: + xz = xz.contiguous() + conv1d_weight = rearrange(conv1d_weight, "d 1 w -> d w") + x, z = xz.chunk(2, dim=1) + conv1d_bias = conv1d_bias.contiguous() if conv1d_bias is not None else None + conv1d_out = causal_conv1d_cuda.causal_conv1d_fwd(x, conv1d_weight, conv1d_bias, True) + # We're being very careful here about the layout, to avoid extra transposes. + # We want delta to have d as the slowest moving dimension + # and L as the fastest moving dimension, since those are what the ssm_scan kernel expects. 
+ x_dbl = F.linear(rearrange(conv1d_out, 'b d l -> (b l) d'), x_proj_weight) # (bl d) + delta = rearrange(delta_proj_weight @ x_dbl[:, :delta_rank].t(), "d (b l) -> b d l", l = L) + ctx.is_variable_B = B is None + ctx.is_variable_C = C is None + ctx.B_proj_bias_is_None = B_proj_bias is None + ctx.C_proj_bias_is_None = C_proj_bias is None + if B is None: # variable B + B = x_dbl[:, delta_rank:delta_rank + d_state] # (bl dstate) + if B_proj_bias is not None: + B = B + B_proj_bias.to(dtype=B.dtype) + if not A.is_complex(): + # B = rearrange(B, "(b l) dstate -> b dstate l", l=L).contiguous() + B = rearrange(B, "(b l) dstate -> b 1 dstate l", l=L).contiguous() + else: + B = rearrange(B, "(b l) (dstate two) -> b 1 dstate (l two)", l=L, two=2).contiguous() + else: + if B.stride(-1) != 1: + B = B.contiguous() + if C is None: # variable C + C = x_dbl[:, -d_state:] # (bl dstate) + if C_proj_bias is not None: + C = C + C_proj_bias.to(dtype=C.dtype) + if not A.is_complex(): + # C = rearrange(C, "(b l) dstate -> b dstate l", l=L).contiguous() + C = rearrange(C, "(b l) dstate -> b 1 dstate l", l=L).contiguous() + else: + C = rearrange(C, "(b l) (dstate two) -> b 1 dstate (l two)", l=L, two=2).contiguous() + else: + if C.stride(-1) != 1: + C = C.contiguous() + if D is not None: + D = D.contiguous() + out, scan_intermediates, out_z = selective_scan_cuda.fwd( + conv1d_out, delta, A, B, C, D, z, delta_bias, delta_softplus + ) + ctx.delta_softplus = delta_softplus + ctx.checkpoint_lvl = checkpoint_lvl + if checkpoint_lvl >= 1: # Will recompute conv1d_out and delta in the backward pass + conv1d_out, delta = None, None + ctx.save_for_backward(xz, conv1d_weight, conv1d_bias, x_dbl, x_proj_weight, + delta_proj_weight, conv1d_out, delta, + A, B, C, D, delta_bias, scan_intermediates, out) + # return rearrange(out_z, "b d l -> b l d") + return out_z + + @staticmethod + @custom_bwd + def backward(ctx, dout): + # dout: (batch, seqlen, dim) + (xz, conv1d_weight, conv1d_bias, x_dbl, x_proj_weight, delta_proj_weight, + conv1d_out, delta, A, B, C, D, delta_bias, scan_intermediates, out) = ctx.saved_tensors + L = xz.shape[-1] + delta_rank = delta_proj_weight.shape[1] + d_state = A.shape[-1] * (1 if not A.is_complex() else 2) + x, z = xz.chunk(2, dim=1) + if dout.stride(-1) != 1: + dout = dout.contiguous() + if ctx.checkpoint_lvl == 1: + conv1d_out = causal_conv1d_cuda.causal_conv1d_fwd(x, conv1d_weight, conv1d_bias, True) + delta = rearrange(delta_proj_weight @ x_dbl[:, :delta_rank].t(), + "d (b l) -> b d l", l = L) + # The kernel supports passing in a pre-allocated dz (e.g., in case we want to fuse the + # backward of selective_scan_cuda with the backward of chunk). 
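+        # Editor's note (added for clarity): dx and dz are views into a single
+        # dxz buffer (chunked along the channel dimension), so the gradient
+        # w.r.t. the in_proj output is returned as one tensor, mirroring how xz
+        # was split in the forward pass.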
+ dxz = torch.empty_like(xz) # (batch, dim, seqlen) + dx, dz = dxz.chunk(2, dim=1) + # dout_y = rearrange(dout, "b l d -> b d l") # because no arrange at end of forward, so dout shape is b d l + dconv1d_out, ddelta, dA, dB, dC, dD, ddelta_bias, dz, out_z = selective_scan_cuda.bwd( + conv1d_out, delta, A, B, C, D, z, delta_bias, dout, scan_intermediates, out, dz, + ctx.delta_softplus, + True # option to recompute out_z + ) + dD = dD if D is not None else None + dx_dbl = torch.empty_like(x_dbl) + dB_proj_bias = None + if ctx.is_variable_B: + if not A.is_complex(): + dB = rearrange(dB, "b 1 dstate l -> (b l) dstate").contiguous() + else: + dB = rearrange(dB, "b 1 dstate (l two) -> (b l) (dstate two)", two=2).contiguous() + dB_proj_bias = dB.sum(0) if not ctx.B_proj_bias_is_None else None + dx_dbl[:, delta_rank:delta_rank + d_state] = dB # (bl d) + dB = None + dC_proj_bias = None + if ctx.is_variable_C: + if not A.is_complex(): + dC = rearrange(dC, "b 1 dstate l -> (b l) dstate").contiguous() + else: + dC = rearrange(dC, "b 1 dstate (l two) -> (b l) (dstate two)", two=2).contiguous() + dC_proj_bias = dC.sum(0) if not ctx.C_proj_bias_is_None else None + dx_dbl[:, -d_state:] = dC # (bl d) + dC = None + ddelta = rearrange(ddelta, "b d l -> d (b l)") + ddelta_proj_weight = torch.einsum("dB,Br->dr", ddelta, x_dbl[:, :delta_rank]) + dx_dbl[:, :delta_rank] = torch.einsum("dB,dr->Br", ddelta, delta_proj_weight) + dconv1d_out = rearrange(dconv1d_out, "b d l -> d (b l)") + dx_proj_weight = torch.einsum("Br,Bd->rd", dx_dbl, rearrange(conv1d_out, "b d l -> (b l) d")) + dconv1d_out = torch.addmm(dconv1d_out, x_proj_weight.t(), dx_dbl.t(), out=dconv1d_out) + dconv1d_out = rearrange(dconv1d_out, "d (b l) -> b d l", b=x.shape[0], l=x.shape[-1]) + # The kernel supports passing in a pre-allocated dx (e.g., in case we want to fuse the + # backward of conv1d with the backward of chunk). 
+ dx, dconv1d_weight, dconv1d_bias = causal_conv1d_cuda.causal_conv1d_bwd( + x, conv1d_weight, conv1d_bias, dconv1d_out, dx, True + ) + dconv1d_bias = dconv1d_bias if conv1d_bias is not None else None + dconv1d_weight = rearrange(dconv1d_weight, "d w -> d 1 w") + return (dxz, dconv1d_weight, dconv1d_bias, dx_proj_weight, ddelta_proj_weight, + dA, dB, dC, dD, + ddelta_bias if delta_bias is not None else None, + dB_proj_bias, dC_proj_bias, None) + + +class MambaInnerFn(torch.autograd.Function): + + @staticmethod + @custom_fwd + def forward(ctx, xz, conv1d_weight, conv1d_bias, x_proj_weight, delta_proj_weight, + out_proj_weight, out_proj_bias, + A, B=None, C=None, D=None, delta_bias=None, B_proj_bias=None, + C_proj_bias=None, delta_softplus=True, checkpoint_lvl=1): + """ + xz: (batch, dim, seqlen) + """ + assert checkpoint_lvl in [0, 1] + L = xz.shape[-1] + delta_rank = delta_proj_weight.shape[1] + d_state = A.shape[-1] * (1 if not A.is_complex() else 2) + if torch.is_autocast_enabled(): + x_proj_weight = x_proj_weight.to(dtype=torch.get_autocast_gpu_dtype()) + delta_proj_weight = delta_proj_weight.to(dtype=torch.get_autocast_gpu_dtype()) + out_proj_weight = out_proj_weight.to(dtype=torch.get_autocast_gpu_dtype()) + out_proj_bias = (out_proj_bias.to(dtype=torch.get_autocast_gpu_dtype()) + if out_proj_bias is not None else None) + if xz.stride(-1) != 1: + xz = xz.contiguous() + conv1d_weight = rearrange(conv1d_weight, "d 1 w -> d w") + x, z = xz.chunk(2, dim=1) + conv1d_bias = conv1d_bias.contiguous() if conv1d_bias is not None else None + conv1d_out = causal_conv1d_cuda.causal_conv1d_fwd(x, conv1d_weight, conv1d_bias, True) + # We're being very careful here about the layout, to avoid extra transposes. + # We want delta to have d as the slowest moving dimension + # and L as the fastest moving dimension, since those are what the ssm_scan kernel expects. 
+ x_dbl = F.linear(rearrange(conv1d_out, 'b d l -> (b l) d'), x_proj_weight) # (bl d) + delta = rearrange(delta_proj_weight @ x_dbl[:, :delta_rank].t(), "d (b l) -> b d l", l = L) + ctx.is_variable_B = B is None + ctx.is_variable_C = C is None + ctx.B_proj_bias_is_None = B_proj_bias is None + ctx.C_proj_bias_is_None = C_proj_bias is None + if B is None: # variable B + B = x_dbl[:, delta_rank:delta_rank + d_state] # (bl dstate) + if B_proj_bias is not None: + B = B + B_proj_bias.to(dtype=B.dtype) + if not A.is_complex(): + # B = rearrange(B, "(b l) dstate -> b dstate l", l=L).contiguous() + B = rearrange(B, "(b l) dstate -> b 1 dstate l", l=L).contiguous() + else: + B = rearrange(B, "(b l) (dstate two) -> b 1 dstate (l two)", l=L, two=2).contiguous() + else: + if B.stride(-1) != 1: + B = B.contiguous() + if C is None: # variable C + C = x_dbl[:, -d_state:] # (bl dstate) + if C_proj_bias is not None: + C = C + C_proj_bias.to(dtype=C.dtype) + if not A.is_complex(): + # C = rearrange(C, "(b l) dstate -> b dstate l", l=L).contiguous() + C = rearrange(C, "(b l) dstate -> b 1 dstate l", l=L).contiguous() + else: + C = rearrange(C, "(b l) (dstate two) -> b 1 dstate (l two)", l=L, two=2).contiguous() + else: + if C.stride(-1) != 1: + C = C.contiguous() + if D is not None: + D = D.contiguous() + out, scan_intermediates, out_z = selective_scan_cuda.fwd( + conv1d_out, delta, A, B, C, D, z, delta_bias, delta_softplus + ) + ctx.delta_softplus = delta_softplus + ctx.out_proj_bias_is_None = out_proj_bias is None + ctx.checkpoint_lvl = checkpoint_lvl + if checkpoint_lvl >= 1: # Will recompute conv1d_out and delta in the backward pass + conv1d_out, delta = None, None + ctx.save_for_backward(xz, conv1d_weight, conv1d_bias, x_dbl, x_proj_weight, + delta_proj_weight, out_proj_weight, conv1d_out, delta, + A, B, C, D, delta_bias, scan_intermediates, out) + return F.linear(rearrange(out_z, "b d l -> b l d"), out_proj_weight, out_proj_bias) + + @staticmethod + @custom_bwd + def backward(ctx, dout): + # dout: (batch, seqlen, dim) + (xz, conv1d_weight, conv1d_bias, x_dbl, x_proj_weight, delta_proj_weight, out_proj_weight, + conv1d_out, delta, A, B, C, D, delta_bias, scan_intermediates, out) = ctx.saved_tensors + L = xz.shape[-1] + delta_rank = delta_proj_weight.shape[1] + d_state = A.shape[-1] * (1 if not A.is_complex() else 2) + x, z = xz.chunk(2, dim=1) + if dout.stride(-1) != 1: + dout = dout.contiguous() + if ctx.checkpoint_lvl == 1: + conv1d_out = causal_conv1d_cuda.causal_conv1d_fwd(x, conv1d_weight, conv1d_bias, True) + delta = rearrange(delta_proj_weight @ x_dbl[:, :delta_rank].t(), + "d (b l) -> b d l", l = L) + # The kernel supports passing in a pre-allocated dz (e.g., in case we want to fuse the + # backward of selective_scan_cuda with the backward of chunk). 
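+        # Editor's note (added for clarity): unlike the no-out-proj variant, the
+        # incoming gradient here is (batch, seqlen, d_model). It is first pulled
+        # back through out_proj by hand (out_proj_weight.t() @ dout) to obtain a
+        # (batch, d_inner, seqlen) gradient for the scan backward, and the same
+        # reshaped dout is reused below to form dout_proj_weight.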
+ dxz = torch.empty_like(xz) # (batch, dim, seqlen) + dx, dz = dxz.chunk(2, dim=1) + dout = rearrange(dout, "b l e -> e (b l)") + dout_y = rearrange(out_proj_weight.t() @ dout, "d (b l) -> b d l", l=L) + dconv1d_out, ddelta, dA, dB, dC, dD, ddelta_bias, dz, out_z = selective_scan_cuda.bwd( + conv1d_out, delta, A, B, C, D, z, delta_bias, dout_y, scan_intermediates, out, dz, + ctx.delta_softplus, + True # option to recompute out_z + ) + dout_proj_weight = torch.einsum("eB,dB->ed", dout, rearrange(out_z, "b d l -> d (b l)")) + dout_proj_bias = dout.sum(dim=(0, 1)) if not ctx.out_proj_bias_is_None else None + dD = dD if D is not None else None + dx_dbl = torch.empty_like(x_dbl) + dB_proj_bias = None + if ctx.is_variable_B: + if not A.is_complex(): + dB = rearrange(dB, "b 1 dstate l -> (b l) dstate").contiguous() + else: + dB = rearrange(dB, "b 1 dstate (l two) -> (b l) (dstate two)", two=2).contiguous() + dB_proj_bias = dB.sum(0) if not ctx.B_proj_bias_is_None else None + dx_dbl[:, delta_rank:delta_rank + d_state] = dB # (bl d) + dB = None + dC_proj_bias = None + if ctx.is_variable_C: + if not A.is_complex(): + dC = rearrange(dC, "b 1 dstate l -> (b l) dstate").contiguous() + else: + dC = rearrange(dC, "b 1 dstate (l two) -> (b l) (dstate two)", two=2).contiguous() + dC_proj_bias = dC.sum(0) if not ctx.C_proj_bias_is_None else None + dx_dbl[:, -d_state:] = dC # (bl d) + dC = None + ddelta = rearrange(ddelta, "b d l -> d (b l)") + ddelta_proj_weight = torch.einsum("dB,Br->dr", ddelta, x_dbl[:, :delta_rank]) + dx_dbl[:, :delta_rank] = torch.einsum("dB,dr->Br", ddelta, delta_proj_weight) + dconv1d_out = rearrange(dconv1d_out, "b d l -> d (b l)") + dx_proj_weight = torch.einsum("Br,Bd->rd", dx_dbl, rearrange(conv1d_out, "b d l -> (b l) d")) + dconv1d_out = torch.addmm(dconv1d_out, x_proj_weight.t(), dx_dbl.t(), out=dconv1d_out) + dconv1d_out = rearrange(dconv1d_out, "d (b l) -> b d l", b=x.shape[0], l=x.shape[-1]) + # The kernel supports passing in a pre-allocated dx (e.g., in case we want to fuse the + # backward of conv1d with the backward of chunk). 
+ dx, dconv1d_weight, dconv1d_bias = causal_conv1d_cuda.causal_conv1d_bwd( + x, conv1d_weight, conv1d_bias, dconv1d_out, dx, True + ) + dconv1d_bias = dconv1d_bias if conv1d_bias is not None else None + dconv1d_weight = rearrange(dconv1d_weight, "d w -> d 1 w") + return (dxz, dconv1d_weight, dconv1d_bias, dx_proj_weight, ddelta_proj_weight, + dout_proj_weight, dout_proj_bias, + dA, dB, dC, dD, + ddelta_bias if delta_bias is not None else None, + dB_proj_bias, dC_proj_bias, None) + + +class BiMambaInnerFn(torch.autograd.Function): + + @staticmethod + @custom_fwd + def forward(ctx, xz, conv1d_weight, conv1d_bias, x_proj_weight, delta_proj_weight, + out_proj_weight, out_proj_bias, + A, A_b, B=None, C=None, D=None, delta_bias=None, B_proj_bias=None, + C_proj_bias=None, delta_softplus=True, checkpoint_lvl=1): + """ + xz: (batch, dim, seqlen) + """ + assert checkpoint_lvl in [0, 1] + L = xz.shape[-1] + delta_rank = delta_proj_weight.shape[1] + d_state = A.shape[-1] * (1 if not A.is_complex() else 2) + if torch.is_autocast_enabled(): + x_proj_weight = x_proj_weight.to(dtype=torch.get_autocast_gpu_dtype()) + delta_proj_weight = delta_proj_weight.to(dtype=torch.get_autocast_gpu_dtype()) + out_proj_weight = out_proj_weight.to(dtype=torch.get_autocast_gpu_dtype()) + out_proj_bias = (out_proj_bias.to(dtype=torch.get_autocast_gpu_dtype()) + if out_proj_bias is not None else None) + if xz.stride(-1) != 1: + xz = xz.contiguous() + conv1d_weight = rearrange(conv1d_weight, "d 1 w -> d w") + x, z = xz.chunk(2, dim=1) + conv1d_bias = conv1d_bias.contiguous() if conv1d_bias is not None else None + conv1d_out = causal_conv1d_cuda.causal_conv1d_fwd(x, conv1d_weight, conv1d_bias, True) + # We're being very careful here about the layout, to avoid extra transposes. + # We want delta to have d as the slowest moving dimension + # and L as the fastest moving dimension, since those are what the ssm_scan kernel expects. + x_dbl = F.linear(rearrange(conv1d_out, 'b d l -> (b l) d'), x_proj_weight) # (bl d) + delta = rearrange(delta_proj_weight @ x_dbl[:, :delta_rank].t(), "d (b l) -> b d l", l = L) + ctx.is_variable_B = B is None + ctx.is_variable_C = C is None + ctx.B_proj_bias_is_None = B_proj_bias is None + ctx.C_proj_bias_is_None = C_proj_bias is None + if B is None: # variable B + B = x_dbl[:, delta_rank:delta_rank + d_state] # (bl dstate) + if B_proj_bias is not None: + B = B + B_proj_bias.to(dtype=B.dtype) + if not A.is_complex(): + # B = rearrange(B, "(b l) dstate -> b dstate l", l=L).contiguous() + B = rearrange(B, "(b l) dstate -> b 1 dstate l", l=L).contiguous() + else: + B = rearrange(B, "(b l) (dstate two) -> b 1 dstate (l two)", l=L, two=2).contiguous() + else: + if B.stride(-1) != 1: + B = B.contiguous() + if C is None: # variable C + C = x_dbl[:, -d_state:] # (bl dstate) + if C_proj_bias is not None: + C = C + C_proj_bias.to(dtype=C.dtype) + if not A.is_complex(): + # C = rearrange(C, "(b l) dstate -> b dstate l", l=L).contiguous() + C = rearrange(C, "(b l) dstate -> b 1 dstate l", l=L).contiguous() + else: + C = rearrange(C, "(b l) (dstate two) -> b 1 dstate (l two)", l=L, two=2).contiguous() + else: + if C.stride(-1) != 1: + C = C.contiguous() + if D is not None: + D = D.contiguous() + out_f, scan_intermediates_f, out_z_f = selective_scan_cuda.fwd( + conv1d_out, delta, A, B, C, D, z, delta_bias, delta_softplus + ) + assert not A_b.is_complex(), "A should not be complex!!" 
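+        # Bidirectional scan: run the selective scan a second time over the time-reversed
+        # sequence with its own state matrix A_b, flip the result back, and sum it with the
+        # forward-direction output so every position aggregates both past and future context.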
+ out_b, scan_intermediates_b, out_z_b = selective_scan_cuda.fwd( + conv1d_out.flip([-1]), delta.flip([-1]), A_b, B.flip([-1]), C.flip([-1]), D, z.flip([-1]), delta_bias, delta_softplus, + ) + + out_z = out_z_f + out_z_b.flip([-1]) + + ctx.delta_softplus = delta_softplus + ctx.out_proj_bias_is_None = out_proj_bias is None + ctx.checkpoint_lvl = checkpoint_lvl + if checkpoint_lvl >= 1: # Will recompute conv1d_out and delta in the backward pass + conv1d_out, delta = None, None + ctx.save_for_backward(xz, conv1d_weight, conv1d_bias, x_dbl, x_proj_weight, + delta_proj_weight, out_proj_weight, conv1d_out, delta, + A, A_b, B, C, D, delta_bias, scan_intermediates_f, scan_intermediates_b, out_f, out_b) + return F.linear(rearrange(out_z, "b d l -> b l d"), out_proj_weight, out_proj_bias) + + @staticmethod + @custom_bwd + def backward(ctx, dout): + # dout: (batch, seqlen, dim) + (xz, conv1d_weight, conv1d_bias, x_dbl, x_proj_weight, delta_proj_weight, out_proj_weight, + conv1d_out, delta, A, A_b, B, C, D, delta_bias, scan_intermediates_f, scan_intermediates_b, out_f, out_b) = ctx.saved_tensors + L = xz.shape[-1] + delta_rank = delta_proj_weight.shape[1] + d_state = A.shape[-1] * (1 if not A.is_complex() else 2) + x, z = xz.chunk(2, dim=1) + if dout.stride(-1) != 1: + dout = dout.contiguous() + if ctx.checkpoint_lvl == 1: + conv1d_out = causal_conv1d_cuda.causal_conv1d_fwd(x, conv1d_weight, conv1d_bias, True) + delta = rearrange(delta_proj_weight @ x_dbl[:, :delta_rank].t(), + "d (b l) -> b d l", l = L) + # The kernel supports passing in a pre-allocated dz (e.g., in case we want to fuse the + # backward of selective_scan_cuda with the backward of chunk). + dxz = torch.empty_like(xz) # (batch, dim, seqlen) + dx, dz = dxz.chunk(2, dim=1) + dout = rearrange(dout, "b l e -> e (b l)") + dout_y = rearrange(out_proj_weight.t() @ dout, "d (b l) -> b d l", l=L) + dconv1d_out, ddelta, dA, dB, dC, dD, ddelta_bias, dz, out_z_f = selective_scan_cuda.bwd( + conv1d_out, delta, A, B, C, D, z, delta_bias, dout_y, scan_intermediates_f, out_f, dz, + ctx.delta_softplus, + True # option to recompute out_z + ) + # flip one + dz_b = torch.empty_like(dz) + dconv1d_out_f_b, ddelta_f_b, dA_b, dB_f_b, dC_f_b, dD_b, ddelta_bias_b, dz_b, out_z_b = selective_scan_cuda.bwd( + conv1d_out.flip([-1]), delta.flip([-1]), A_b, B.flip([-1]), C.flip([-1]), D, z.flip([-1]), delta_bias, dout_y.flip([-1]), scan_intermediates_b, out_b, dz_b, + ctx.delta_softplus, + True # option to recompute out_z + ) + + dconv1d_out = dconv1d_out + dconv1d_out_f_b.flip([-1]) + ddelta = ddelta + ddelta_f_b.flip([-1]) + dB = dB + dB_f_b.flip([-1]) + dC = dC + dC_f_b.flip([-1]) + dD = dD + dD_b + ddelta_bias = ddelta_bias + ddelta_bias_b + dz = dz + dz_b.flip([-1]) + out_z = out_z_f + out_z_b.flip([-1]) + + dout_proj_weight = torch.einsum("eB,dB->ed", dout, rearrange(out_z, "b d l -> d (b l)")) + dout_proj_bias = dout.sum(dim=(0, 1)) if not ctx.out_proj_bias_is_None else None + dD = dD if D is not None else None + dx_dbl = torch.empty_like(x_dbl) + dB_proj_bias = None + if ctx.is_variable_B: + if not A.is_complex(): + dB = rearrange(dB, "b 1 dstate l -> (b l) dstate").contiguous() + else: + dB = rearrange(dB, "b 1 dstate (l two) -> (b l) (dstate two)", two=2).contiguous() + dB_proj_bias = dB.sum(0) if not ctx.B_proj_bias_is_None else None + dx_dbl[:, delta_rank:delta_rank + d_state] = dB # (bl d) + dB = None + dC_proj_bias = None + if ctx.is_variable_C: + if not A.is_complex(): + dC = rearrange(dC, "b 1 dstate l -> (b l) dstate").contiguous() + else: + dC = 
rearrange(dC, "b 1 dstate (l two) -> (b l) (dstate two)", two=2).contiguous() + dC_proj_bias = dC.sum(0) if not ctx.C_proj_bias_is_None else None + dx_dbl[:, -d_state:] = dC # (bl d) + dC = None + ddelta = rearrange(ddelta, "b d l -> d (b l)") + ddelta_proj_weight = torch.einsum("dB,Br->dr", ddelta, x_dbl[:, :delta_rank]) + dx_dbl[:, :delta_rank] = torch.einsum("dB,dr->Br", ddelta, delta_proj_weight) + dconv1d_out = rearrange(dconv1d_out, "b d l -> d (b l)") + dx_proj_weight = torch.einsum("Br,Bd->rd", dx_dbl, rearrange(conv1d_out, "b d l -> (b l) d")) + dconv1d_out = torch.addmm(dconv1d_out, x_proj_weight.t(), dx_dbl.t(), out=dconv1d_out) + dconv1d_out = rearrange(dconv1d_out, "d (b l) -> b d l", b=x.shape[0], l=x.shape[-1]) + # The kernel supports passing in a pre-allocated dx (e.g., in case we want to fuse the + # backward of conv1d with the backward of chunk). + dx, dconv1d_weight, dconv1d_bias = causal_conv1d_cuda.causal_conv1d_bwd( + x, conv1d_weight, conv1d_bias, dconv1d_out, dx, True + ) + dconv1d_bias = dconv1d_bias if conv1d_bias is not None else None + dconv1d_weight = rearrange(dconv1d_weight, "d w -> d 1 w") + return (dxz, dconv1d_weight, dconv1d_bias, dx_proj_weight, ddelta_proj_weight, + dout_proj_weight, dout_proj_bias, + dA, dA_b, dB, dC, dD, + ddelta_bias if delta_bias is not None else None, + dB_proj_bias, dC_proj_bias, None) + + +def mamba_inner_fn( + xz, conv1d_weight, conv1d_bias, x_proj_weight, delta_proj_weight, + out_proj_weight, out_proj_bias, + A, B=None, C=None, D=None, delta_bias=None, B_proj_bias=None, + C_proj_bias=None, delta_softplus=True +): + return MambaInnerFn.apply(xz, conv1d_weight, conv1d_bias, x_proj_weight, delta_proj_weight, + out_proj_weight, out_proj_bias, + A, B, C, D, delta_bias, B_proj_bias, C_proj_bias, delta_softplus) + +def bimamba_inner_fn( + xz, conv1d_weight, conv1d_bias, x_proj_weight, delta_proj_weight, + out_proj_weight, out_proj_bias, + A, A_b, B=None, C=None, D=None, delta_bias=None, B_proj_bias=None, + C_proj_bias=None, delta_softplus=True +): + return BiMambaInnerFn.apply(xz, conv1d_weight, conv1d_bias, x_proj_weight, delta_proj_weight, + out_proj_weight, out_proj_bias, + A, A_b, B, C, D, delta_bias, B_proj_bias, C_proj_bias, delta_softplus) + + +def mamba_inner_fn_no_out_proj( + xz, conv1d_weight, conv1d_bias, x_proj_weight, delta_proj_weight, + A, B=None, C=None, D=None, delta_bias=None, B_proj_bias=None, + C_proj_bias=None, delta_softplus=True +): + return MambaInnerFnNoOutProj.apply(xz, conv1d_weight, conv1d_bias, x_proj_weight, delta_proj_weight, + A, B, C, D, delta_bias, B_proj_bias, C_proj_bias, delta_softplus) + + +def mamba_inner_ref( + xz, conv1d_weight, conv1d_bias, x_proj_weight, delta_proj_weight, + out_proj_weight, out_proj_bias, + A, B=None, C=None, D=None, delta_bias=None, B_proj_bias=None, + C_proj_bias=None, delta_softplus=True +): + L = xz.shape[-1] + delta_rank = delta_proj_weight.shape[1] + d_state = A.shape[-1] * (1 if not A.is_complex() else 2) + x, z = xz.chunk(2, dim=1) + x = causal_conv1d_fn(x, rearrange(conv1d_weight, "d 1 w -> d w"), conv1d_bias, "silu") + # We're being very careful here about the layout, to avoid extra transposes. + # We want delta to have d as the slowest moving dimension + # and L as the fastest moving dimension, since those are what the ssm_scan kernel expects. 
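+    # Reference (non-fused) path: the same projection -> causal conv -> selective scan
+    # pipeline composed from standard ops, typically used to check the fused MambaInnerFn
+    # kernel for correctness.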
+ x_dbl = F.linear(rearrange(x, 'b d l -> (b l) d'), x_proj_weight) # (bl d) + delta = delta_proj_weight @ x_dbl[:, :delta_rank].t() + delta = rearrange(delta, "d (b l) -> b d l", l=L) + if B is None: # variable B + B = x_dbl[:, delta_rank:delta_rank + d_state] # (bl d) + if B_proj_bias is not None: + B = B + B_proj_bias.to(dtype=B.dtype) + if not A.is_complex(): + B = rearrange(B, "(b l) dstate -> b dstate l", l=L).contiguous() + else: + B = rearrange(B, "(b l) (dstate two) -> b dstate (l two)", l=L, two=2).contiguous() + if C is None: # variable B + C = x_dbl[:, -d_state:] # (bl d) + if C_proj_bias is not None: + C = C + C_proj_bias.to(dtype=C.dtype) + if not A.is_complex(): + C = rearrange(C, "(b l) dstate -> b dstate l", l=L).contiguous() + else: + C = rearrange(C, "(b l) (dstate two) -> b dstate (l two)", l=L, two=2).contiguous() + y = selective_scan_fn(x, delta, A, B, C, D, z=z, delta_bias=delta_bias, delta_softplus=True) + return F.linear(rearrange(y, "b d l -> b l d"), out_proj_weight, out_proj_bias) + + +def bimamba_inner_ref( + xz, conv1d_weight, conv1d_bias, x_proj_weight, delta_proj_weight, + out_proj_weight, out_proj_bias, + A, A_b, B=None, C=None, D=None, delta_bias=None, B_proj_bias=None, + C_proj_bias=None, delta_softplus=True +): + L = xz.shape[-1] + delta_rank = delta_proj_weight.shape[1] + d_state = A.shape[-1] * (1 if not A.is_complex() else 2) + x, z = xz.chunk(2, dim=1) + x = causal_conv1d_fn(x, rearrange(conv1d_weight, "d 1 w -> d w"), conv1d_bias, "silu") + # We're being very careful here about the layout, to avoid extra transposes. + # We want delta to have d as the slowest moving dimension + # and L as the fastest moving dimension, since those are what the ssm_scan kernel expects. + x_dbl = F.linear(rearrange(x, 'b d l -> (b l) d'), x_proj_weight) # (bl d) + delta = delta_proj_weight @ x_dbl[:, :delta_rank].t() + delta = rearrange(delta, "d (b l) -> b d l", l=L) + if B is None: # variable B + B = x_dbl[:, delta_rank:delta_rank + d_state] # (bl d) + if B_proj_bias is not None: + B = B + B_proj_bias.to(dtype=B.dtype) + if not A.is_complex(): + B = rearrange(B, "(b l) dstate -> b dstate l", l=L).contiguous() + else: + B = rearrange(B, "(b l) (dstate two) -> b dstate (l two)", l=L, two=2).contiguous() + if C is None: # variable B + C = x_dbl[:, -d_state:] # (bl d) + if C_proj_bias is not None: + C = C + C_proj_bias.to(dtype=C.dtype) + if not A.is_complex(): + C = rearrange(C, "(b l) dstate -> b dstate l", l=L).contiguous() + else: + C = rearrange(C, "(b l) (dstate two) -> b dstate (l two)", l=L, two=2).contiguous() + y = selective_scan_fn(x, delta, A, B, C, D, z=z, delta_bias=delta_bias, delta_softplus=True) + y_b = selective_scan_fn(x.flip([-1]), delta.flip([-1]), A_b, B.flip([-1]), C.flip([-1]), D, z.flip([-1]), delta_bias, delta_softplus=True) + y = y + y_b.flip([-1]) + return F.linear(rearrange(y, "b d l -> b l d"), out_proj_weight, out_proj_bias) diff --git a/mamba/mamba_ssm/ops/triton/__init__.py b/mamba/mamba_ssm/ops/triton/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/mamba/mamba_ssm/ops/triton/layernorm.py b/mamba/mamba_ssm/ops/triton/layernorm.py new file mode 100644 index 0000000000000000000000000000000000000000..70d57397f6e5af1138e8df62629f9ab57174f6f4 --- /dev/null +++ b/mamba/mamba_ssm/ops/triton/layernorm.py @@ -0,0 +1,636 @@ +# Copyright (c) 2023, Tri Dao. +# Implement residual + layer_norm / rms_norm. 
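+# Reference semantics of the fused kernels below (see layer_norm_ref / rms_norm_ref):
+#     h = x + residual                      # optional residual add
+#     y = normalize(h) * weight + bias      # LayerNorm, or RMSNorm when is_rms_norm=True
+# With prenorm=True the pre-norm sum h is returned as well, so the next block can reuse it
+# as its residual stream.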
+ +# Based on the Triton LayerNorm tutorial: https://triton-lang.org/main/getting-started/tutorials/05-layer-norm.html +# For the backward pass, we keep weight_grad and bias_grad in registers and accumulate. +# This is faster for dimensions up to 8k, but after that it's much slower due to register spilling. +# The models we train have hidden dim up to 8k anyway (e.g. Llama 70B), so this is fine. + +import math + +import torch +import torch.nn.functional as F +from torch.cuda.amp import custom_fwd, custom_bwd + +import triton +import triton.language as tl + + +def layer_norm_ref(x, weight, bias, residual=None, eps=1e-6, prenorm=False, upcast=False): + dtype = x.dtype + if upcast: + weight = weight.float() + bias = bias.float() if bias is not None else None + if upcast: + x = x.float() + residual = residual.float() if residual is not None else residual + if residual is not None: + x = (x + residual).to(x.dtype) + out = F.layer_norm(x.to(weight.dtype), x.shape[-1:], weight=weight, bias=bias, eps=eps).to( + dtype + ) + return out if not prenorm else (out, x) + + +def rms_norm_ref(x, weight, bias, residual=None, eps=1e-6, prenorm=False, upcast=False): + dtype = x.dtype + if upcast: + weight = weight.float() + bias = bias.float() if bias is not None else None + if upcast: + x = x.float() + residual = residual.float() if residual is not None else residual + if residual is not None: + x = (x + residual).to(x.dtype) + rstd = 1 / torch.sqrt((x.square()).mean(dim=-1, keepdim=True) + eps) + out = (x * rstd * weight) + bias if bias is not None else (x * rstd * weight) + out = out.to(dtype) + return out if not prenorm else (out, x) + + +@triton.autotune( + configs=[ + triton.Config({}, num_warps=1), + triton.Config({}, num_warps=2), + triton.Config({}, num_warps=4), + triton.Config({}, num_warps=8), + triton.Config({}, num_warps=16), + triton.Config({}, num_warps=32), + ], + key=["N", "HAS_RESIDUAL", "STORE_RESIDUAL_OUT", "IS_RMS_NORM", "HAS_BIAS"], +) +# @triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None}) +# @triton.heuristics({"HAS_RESIDUAL": lambda args: args["RESIDUAL"] is not None}) +@triton.jit +def _layer_norm_fwd_1pass_kernel( + X, # pointer to the input + Y, # pointer to the output + W, # pointer to the weights + B, # pointer to the biases + RESIDUAL, # pointer to the residual + RESIDUAL_OUT, # pointer to the residual + Mean, # pointer to the mean + Rstd, # pointer to the 1/std + stride_x_row, # how much to increase the pointer when moving by 1 row + stride_y_row, + stride_res_row, + stride_res_out_row, + N, # number of columns in X + eps, # epsilon to avoid division by zero + IS_RMS_NORM: tl.constexpr, + BLOCK_N: tl.constexpr, + HAS_RESIDUAL: tl.constexpr, + STORE_RESIDUAL_OUT: tl.constexpr, + HAS_BIAS: tl.constexpr, +): + # Map the program id to the row of X and Y it should compute. 
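+    # One program handles one row (token). BLOCK_N is chosen by the host code to cover the
+    # whole feature dimension, so mean/variance and the normalized output are produced in a
+    # single pass with no cross-program reduction.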
+ row = tl.program_id(0) + X += row * stride_x_row + Y += row * stride_y_row + if HAS_RESIDUAL: + RESIDUAL += row * stride_res_row + if STORE_RESIDUAL_OUT: + RESIDUAL_OUT += row * stride_res_out_row + # Compute mean and variance + cols = tl.arange(0, BLOCK_N) + x = tl.load(X + cols, mask=cols < N, other=0.0).to(tl.float32) + if HAS_RESIDUAL: + residual = tl.load(RESIDUAL + cols, mask=cols < N, other=0.0).to(tl.float32) + x += residual + if STORE_RESIDUAL_OUT: + tl.store(RESIDUAL_OUT + cols, x, mask=cols < N) + if not IS_RMS_NORM: + mean = tl.sum(x, axis=0) / N + tl.store(Mean + row, mean) + xbar = tl.where(cols < N, x - mean, 0.0) + var = tl.sum(xbar * xbar, axis=0) / N + else: + xbar = tl.where(cols < N, x, 0.0) + var = tl.sum(xbar * xbar, axis=0) / N + rstd = 1 / tl.sqrt(var + eps) + tl.store(Rstd + row, rstd) + # Normalize and apply linear transformation + mask = cols < N + w = tl.load(W + cols, mask=mask).to(tl.float32) + if HAS_BIAS: + b = tl.load(B + cols, mask=mask).to(tl.float32) + x_hat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd + y = x_hat * w + b if HAS_BIAS else x_hat * w + # Write output + tl.store(Y + cols, y, mask=mask) + + +def _layer_norm_fwd( + x, weight, bias, eps, residual=None, out_dtype=None, residual_dtype=None, is_rms_norm=False +): + if residual is not None: + residual_dtype = residual.dtype + M, N = x.shape + assert x.stride(-1) == 1 + if residual is not None: + assert residual.stride(-1) == 1 + assert residual.shape == (M, N) + assert weight.shape == (N,) + assert weight.stride(-1) == 1 + if bias is not None: + assert bias.stride(-1) == 1 + assert bias.shape == (N,) + # allocate output + y = torch.empty_like(x, dtype=x.dtype if out_dtype is None else out_dtype) + assert y.stride(-1) == 1 + if residual is not None or (residual_dtype is not None and residual_dtype != x.dtype): + residual_out = torch.empty(M, N, device=x.device, dtype=residual_dtype) + assert residual_out.stride(-1) == 1 + else: + residual_out = None + mean = torch.empty((M,), dtype=torch.float32, device="cuda") if not is_rms_norm else None + rstd = torch.empty((M,), dtype=torch.float32, device="cuda") + # Less than 64KB per feature: enqueue fused kernel + MAX_FUSED_SIZE = 65536 // x.element_size() + BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(N)) + if N > BLOCK_N: + raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.") + # heuristics for number of warps + with torch.cuda.device(x.device.index): + _layer_norm_fwd_1pass_kernel[(M,)]( + x, + y, + weight, + bias, + residual, + residual_out, + mean, + rstd, + x.stride(0), + y.stride(0), + residual.stride(0) if residual is not None else 0, + residual_out.stride(0) if residual_out is not None else 0, + N, + eps, + is_rms_norm, + BLOCK_N, + residual is not None, + residual_out is not None, + bias is not None, + ) + # residual_out is None if residual is None and residual_dtype == input_dtype + return y, mean, rstd, residual_out if residual_out is not None else x + + +@triton.autotune( + configs=[ + triton.Config({}, num_warps=1), + triton.Config({}, num_warps=2), + triton.Config({}, num_warps=4), + triton.Config({}, num_warps=8), + triton.Config({}, num_warps=16), + triton.Config({}, num_warps=32), + ], + key=["N", "HAS_DRESIDUAL", "STORE_DRESIDUAL", "IS_RMS_NORM", "HAS_BIAS"], +) +# @triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None}) +# @triton.heuristics({"HAS_DRESIDUAL": lambda args: args["DRESIDUAL"] is not None}) +# @triton.heuristics({"STORE_DRESIDUAL": lambda args: args["DRESIDUAL_IN"] is not 
None}) +@triton.heuristics({"RECOMPUTE_OUTPUT": lambda args: args["Y"] is not None}) +@triton.jit +def _layer_norm_bwd_kernel( + X, # pointer to the input + W, # pointer to the weights + B, # pointer to the biases + Y, # pointer to the output to be recomputed + DY, # pointer to the output gradient + DX, # pointer to the input gradient + DW, # pointer to the partial sum of weights gradient + DB, # pointer to the partial sum of biases gradient + DRESIDUAL, + DRESIDUAL_IN, + Mean, # pointer to the mean + Rstd, # pointer to the 1/std + stride_x_row, # how much to increase the pointer when moving by 1 row + stride_y_row, + stride_dy_row, + stride_dx_row, + stride_dres_row, + stride_dres_in_row, + M, # number of rows in X + N, # number of columns in X + eps, # epsilon to avoid division by zero + rows_per_program, + IS_RMS_NORM: tl.constexpr, + BLOCK_N: tl.constexpr, + HAS_DRESIDUAL: tl.constexpr, + STORE_DRESIDUAL: tl.constexpr, + HAS_BIAS: tl.constexpr, + RECOMPUTE_OUTPUT: tl.constexpr, +): + # Map the program id to the elements of X, DX, and DY it should compute. + row_block_id = tl.program_id(0) + row_start = row_block_id * rows_per_program + cols = tl.arange(0, BLOCK_N) + mask = cols < N + X += row_start * stride_x_row + if HAS_DRESIDUAL: + DRESIDUAL += row_start * stride_dres_row + if STORE_DRESIDUAL: + DRESIDUAL_IN += row_start * stride_dres_in_row + DY += row_start * stride_dy_row + DX += row_start * stride_dx_row + if RECOMPUTE_OUTPUT: + Y += row_start * stride_y_row + w = tl.load(W + cols, mask=mask).to(tl.float32) + if RECOMPUTE_OUTPUT and HAS_BIAS: + b = tl.load(B + cols, mask=mask, other=0.0).to(tl.float32) + dw = tl.zeros((BLOCK_N,), dtype=tl.float32) + if HAS_BIAS: + db = tl.zeros((BLOCK_N,), dtype=tl.float32) + row_end = min((row_block_id + 1) * rows_per_program, M) + for row in range(row_start, row_end): + # Load data to SRAM + x = tl.load(X + cols, mask=mask, other=0).to(tl.float32) + dy = tl.load(DY + cols, mask=mask, other=0).to(tl.float32) + if not IS_RMS_NORM: + mean = tl.load(Mean + row) + rstd = tl.load(Rstd + row) + # Compute dx + xhat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd + xhat = tl.where(mask, xhat, 0.0) + if RECOMPUTE_OUTPUT: + y = xhat * w + b if HAS_BIAS else xhat * w + tl.store(Y + cols, y, mask=mask) + wdy = w * dy + dw += dy * xhat + if HAS_BIAS: + db += dy + if not IS_RMS_NORM: + c1 = tl.sum(xhat * wdy, axis=0) / N + c2 = tl.sum(wdy, axis=0) / N + dx = (wdy - (xhat * c1 + c2)) * rstd + else: + c1 = tl.sum(xhat * wdy, axis=0) / N + dx = (wdy - xhat * c1) * rstd + if HAS_DRESIDUAL: + dres = tl.load(DRESIDUAL + cols, mask=mask, other=0).to(tl.float32) + dx += dres + # Write dx + if STORE_DRESIDUAL: + tl.store(DRESIDUAL_IN + cols, dx, mask=mask) + tl.store(DX + cols, dx, mask=mask) + + X += stride_x_row + if HAS_DRESIDUAL: + DRESIDUAL += stride_dres_row + if STORE_DRESIDUAL: + DRESIDUAL_IN += stride_dres_in_row + if RECOMPUTE_OUTPUT: + Y += stride_y_row + DY += stride_dy_row + DX += stride_dx_row + tl.store(DW + row_block_id * N + cols, dw, mask=mask) + if HAS_BIAS: + tl.store(DB + row_block_id * N + cols, db, mask=mask) + + +def _layer_norm_bwd( + dy, + x, + weight, + bias, + eps, + mean, + rstd, + dresidual=None, + has_residual=False, + is_rms_norm=False, + x_dtype=None, + recompute_output=False, +): + M, N = x.shape + assert x.stride(-1) == 1 + assert dy.stride(-1) == 1 + assert dy.shape == (M, N) + if dresidual is not None: + assert dresidual.stride(-1) == 1 + assert dresidual.shape == (M, N) + assert weight.shape == (N,) + assert 
weight.stride(-1) == 1 + if bias is not None: + assert bias.stride(-1) == 1 + assert bias.shape == (N,) + # allocate output + dx = ( + torch.empty_like(x) + if x_dtype is None + else torch.empty(M, N, dtype=x_dtype, device=x.device) + ) + dresidual_in = torch.empty_like(x) if has_residual and dx.dtype != x.dtype else None + y = torch.empty(M, N, dtype=dy.dtype, device=dy.device) if recompute_output else None + + # Less than 64KB per feature: enqueue fused kernel + MAX_FUSED_SIZE = 65536 // x.element_size() + BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(N)) + if N > BLOCK_N: + raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.") + sm_count = torch.cuda.get_device_properties(x.device).multi_processor_count + _dw = torch.empty((sm_count, N), dtype=torch.float32, device=weight.device) + _db = ( + torch.empty((sm_count, N), dtype=torch.float32, device=bias.device) + if bias is not None + else None + ) + rows_per_program = math.ceil(M / sm_count) + grid = (sm_count,) + with torch.cuda.device(x.device.index): + _layer_norm_bwd_kernel[grid]( + x, + weight, + bias, + y, + dy, + dx, + _dw, + _db, + dresidual, + dresidual_in, + mean, + rstd, + x.stride(0), + 0 if not recompute_output else y.stride(0), + dy.stride(0), + dx.stride(0), + dresidual.stride(0) if dresidual is not None else 0, + dresidual_in.stride(0) if dresidual_in is not None else 0, + M, + N, + eps, + rows_per_program, + is_rms_norm, + BLOCK_N, + dresidual is not None, + dresidual_in is not None, + bias is not None, + ) + dw = _dw.sum(0).to(weight.dtype) + db = _db.sum(0).to(bias.dtype) if bias is not None else None + # Don't need to compute dresidual_in separately in this case + if has_residual and dx.dtype == x.dtype: + dresidual_in = dx + return (dx, dw, db, dresidual_in) if not recompute_output else (dx, dw, db, dresidual_in, y) + + +class LayerNormFn(torch.autograd.Function): + @staticmethod + def forward( + ctx, + x, + weight, + bias, + residual=None, + eps=1e-6, + prenorm=False, + residual_in_fp32=False, + is_rms_norm=False, + ): + x_shape_og = x.shape + # reshape input data into 2D tensor + x = x.reshape(-1, x.shape[-1]) + if x.stride(-1) != 1: + x = x.contiguous() + if residual is not None: + assert residual.shape == x_shape_og + residual = residual.reshape(-1, residual.shape[-1]) + if residual.stride(-1) != 1: + residual = residual.contiguous() + weight = weight.contiguous() + if bias is not None: + bias = bias.contiguous() + residual_dtype = ( + residual.dtype + if residual is not None + else (torch.float32 if residual_in_fp32 else None) + ) + y, mean, rstd, residual_out = _layer_norm_fwd( + x, weight, bias, eps, residual, residual_dtype=residual_dtype, is_rms_norm=is_rms_norm + ) + ctx.save_for_backward(residual_out, weight, bias, mean, rstd) + ctx.x_shape_og = x_shape_og + ctx.eps = eps + ctx.is_rms_norm = is_rms_norm + ctx.has_residual = residual is not None + ctx.prenorm = prenorm + ctx.x_dtype = x.dtype + y = y.reshape(x_shape_og) + return y if not prenorm else (y, residual_out.reshape(x_shape_og)) + + @staticmethod + def backward(ctx, dy, *args): + x, weight, bias, mean, rstd = ctx.saved_tensors + dy = dy.reshape(-1, dy.shape[-1]) + if dy.stride(-1) != 1: + dy = dy.contiguous() + assert dy.shape == x.shape + if ctx.prenorm: + dresidual = args[0] + dresidual = dresidual.reshape(-1, dresidual.shape[-1]) + if dresidual.stride(-1) != 1: + dresidual = dresidual.contiguous() + assert dresidual.shape == x.shape + else: + dresidual = None + dx, dw, db, dresidual_in = _layer_norm_bwd( + dy, + x, + 
weight, + bias, + ctx.eps, + mean, + rstd, + dresidual, + ctx.has_residual, + ctx.is_rms_norm, + x_dtype=ctx.x_dtype, + ) + return ( + dx.reshape(ctx.x_shape_og), + dw, + db, + dresidual_in.reshape(ctx.x_shape_og) if ctx.has_residual else None, + None, + None, + None, + None, + ) + + +def layer_norm_fn( + x, + weight, + bias, + residual=None, + eps=1e-6, + prenorm=False, + residual_in_fp32=False, + is_rms_norm=False, +): + return LayerNormFn.apply(x, weight, bias, residual, eps, prenorm, residual_in_fp32, is_rms_norm) + + +def rms_norm_fn(x, weight, bias, residual=None, prenorm=False, residual_in_fp32=False, eps=1e-6): + return LayerNormFn.apply(x, weight, bias, residual, eps, prenorm, residual_in_fp32, True) + + +class RMSNorm(torch.nn.Module): + def __init__(self, hidden_size, eps=1e-5, device=None, dtype=None): + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.eps = eps + self.weight = torch.nn.Parameter(torch.empty(hidden_size, **factory_kwargs)) + self.register_parameter("bias", None) + self.reset_parameters() + + def reset_parameters(self): + torch.nn.init.ones_(self.weight) + + def forward(self, x, residual=None, prenorm=False, residual_in_fp32=False): + return rms_norm_fn( + x, + self.weight, + self.bias, + residual=residual, + eps=self.eps, + prenorm=prenorm, + residual_in_fp32=residual_in_fp32, + # is_rms_norm=True, + ) + + +class LayerNormLinearFn(torch.autograd.Function): + @staticmethod + @custom_fwd + def forward( + ctx, + x, + norm_weight, + norm_bias, + linear_weight, + linear_bias, + residual=None, + eps=1e-6, + prenorm=False, + residual_in_fp32=False, + is_rms_norm=False, + ): + x_shape_og = x.shape + # reshape input data into 2D tensor + x = x.reshape(-1, x.shape[-1]) + if x.stride(-1) != 1: + x = x.contiguous() + if residual is not None: + assert residual.shape == x_shape_og + residual = residual.reshape(-1, residual.shape[-1]) + if residual.stride(-1) != 1: + residual = residual.contiguous() + norm_weight = norm_weight.contiguous() + if norm_bias is not None: + norm_bias = norm_bias.contiguous() + residual_dtype = ( + residual.dtype + if residual is not None + else (torch.float32 if residual_in_fp32 else None) + ) + y, mean, rstd, residual_out = _layer_norm_fwd( + x, + norm_weight, + norm_bias, + eps, + residual, + out_dtype=None if not torch.is_autocast_enabled() else torch.get_autocast_gpu_dtype(), + residual_dtype=residual_dtype, + is_rms_norm=is_rms_norm, + ) + y = y.reshape(x_shape_og) + dtype = torch.get_autocast_gpu_dtype() if torch.is_autocast_enabled() else y.dtype + linear_weight = linear_weight.to(dtype) + linear_bias = linear_bias.to(dtype) if linear_bias is not None else None + out = F.linear(y.to(linear_weight.dtype), linear_weight, linear_bias) + # We don't store y, will be recomputed in the backward pass to save memory + ctx.save_for_backward(residual_out, norm_weight, norm_bias, linear_weight, mean, rstd) + ctx.x_shape_og = x_shape_og + ctx.eps = eps + ctx.is_rms_norm = is_rms_norm + ctx.has_residual = residual is not None + ctx.prenorm = prenorm + ctx.x_dtype = x.dtype + ctx.linear_bias_is_none = linear_bias is None + return out if not prenorm else (out, residual_out.reshape(x_shape_og)) + + @staticmethod + @custom_bwd + def backward(ctx, dout, *args): + x, norm_weight, norm_bias, linear_weight, mean, rstd = ctx.saved_tensors + dout = dout.reshape(-1, dout.shape[-1]) + dy = F.linear(dout, linear_weight.t()) + dlinear_bias = None if ctx.linear_bias_is_none else dout.sum(0) + if dy.stride(-1) != 1: + dy = dy.contiguous() 
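+        # y (the normalized output) was not saved in forward; _layer_norm_bwd is called with
+        # recompute_output=True below so y is rebuilt on the fly and dlinear_weight can be
+        # formed from dout and y without keeping the extra activation around.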
+ assert dy.shape == x.shape + if ctx.prenorm: + dresidual = args[0] + dresidual = dresidual.reshape(-1, dresidual.shape[-1]) + if dresidual.stride(-1) != 1: + dresidual = dresidual.contiguous() + assert dresidual.shape == x.shape + else: + dresidual = None + dx, dnorm_weight, dnorm_bias, dresidual_in, y = _layer_norm_bwd( + dy, + x, + norm_weight, + norm_bias, + ctx.eps, + mean, + rstd, + dresidual, + ctx.has_residual, + ctx.is_rms_norm, + x_dtype=ctx.x_dtype, + recompute_output=True, + ) + dlinear_weight = torch.einsum("bo,bi->oi", dout, y) + return ( + dx.reshape(ctx.x_shape_og), + dnorm_weight, + dnorm_bias, + dlinear_weight, + dlinear_bias, + dresidual_in.reshape(ctx.x_shape_og) if ctx.has_residual else None, + None, + None, + None, + None, + ) + + +def layer_norm_linear_fn( + x, + norm_weight, + norm_bias, + linear_weight, + linear_bias, + residual=None, + eps=1e-6, + prenorm=False, + residual_in_fp32=False, + is_rms_norm=False, +): + return LayerNormLinearFn.apply( + x, + norm_weight, + norm_bias, + linear_weight, + linear_bias, + residual, + eps, + prenorm, + residual_in_fp32, + is_rms_norm, + ) diff --git a/mamba/mamba_ssm/ops/triton/selective_state_update.py b/mamba/mamba_ssm/ops/triton/selective_state_update.py new file mode 100644 index 0000000000000000000000000000000000000000..fa95de73f173292914c5f00fbe9426937d00e502 --- /dev/null +++ b/mamba/mamba_ssm/ops/triton/selective_state_update.py @@ -0,0 +1,192 @@ +# Copyright (c) 2023, Tri Dao. + +"""We want triton==2.1.0 for this +""" + +import math +import torch +import torch.nn.functional as F + +import triton +import triton.language as tl + +from einops import rearrange, repeat + + +@triton.heuristics({"HAS_DT_BIAS": lambda args: args["dt_bias_ptr"] is not None}) +@triton.heuristics({"HAS_D": lambda args: args["D_ptr"] is not None}) +@triton.heuristics({"HAS_Z": lambda args: args["z_ptr"] is not None}) +@triton.heuristics({"BLOCK_SIZE_DSTATE": lambda args: triton.next_power_of_2(args["dstate"])}) +@triton.jit +def _selective_scan_update_kernel( + # Pointers to matrices + state_ptr, x_ptr, dt_ptr, dt_bias_ptr, A_ptr, B_ptr, C_ptr, D_ptr, z_ptr, out_ptr, + # Matrix dimensions + batch, dim, dstate, + # Strides + stride_state_batch, stride_state_dim, stride_state_dstate, + stride_x_batch, stride_x_dim, + stride_dt_batch, stride_dt_dim, + stride_dt_bias_dim, + stride_A_dim, stride_A_dstate, + stride_B_batch, stride_B_dstate, + stride_C_batch, stride_C_dstate, + stride_D_dim, + stride_z_batch, stride_z_dim, + stride_out_batch, stride_out_dim, + # Meta-parameters + DT_SOFTPLUS: tl.constexpr, + BLOCK_SIZE_M: tl.constexpr, + HAS_DT_BIAS: tl.constexpr, + HAS_D: tl.constexpr, + HAS_Z: tl.constexpr, + BLOCK_SIZE_DSTATE: tl.constexpr, +): + pid_m = tl.program_id(axis=0) + pid_b = tl.program_id(axis=1) + state_ptr += pid_b * stride_state_batch + x_ptr += pid_b * stride_x_batch + dt_ptr += pid_b * stride_dt_batch + B_ptr += pid_b * stride_B_batch + C_ptr += pid_b * stride_C_batch + if HAS_Z: + z_ptr += pid_b * stride_z_batch + out_ptr += pid_b * stride_out_batch + + offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_n = tl.arange(0, BLOCK_SIZE_DSTATE) + state_ptrs = state_ptr + (offs_m[:, None] * stride_state_dim + offs_n[None, :] * stride_state_dstate) + x_ptrs = x_ptr + offs_m * stride_x_dim + dt_ptrs = dt_ptr + offs_m * stride_dt_dim + if HAS_DT_BIAS: + dt_bias_ptrs = dt_bias_ptr + offs_m * stride_dt_bias_dim + A_ptrs = A_ptr + (offs_m[:, None] * stride_A_dim + offs_n[None, :] * stride_A_dstate) + B_ptrs = B_ptr + offs_n * 
stride_B_dstate + C_ptrs = C_ptr + offs_n * stride_C_dstate + if HAS_D: + D_ptrs = D_ptr + offs_m * stride_D_dim + if HAS_Z: + z_ptrs = z_ptr + offs_m * stride_z_dim + out_ptrs = out_ptr + offs_m * stride_out_dim + + state = tl.load(state_ptrs, mask=(offs_m[:, None] < dim) & (offs_n[None, :] < dstate), other=0.0) + x = tl.load(x_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32) + dt = tl.load(dt_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32) + if HAS_DT_BIAS: + dt += tl.load(dt_bias_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32) + if DT_SOFTPLUS: + dt = tl.log(1.0 + tl.exp(dt)) + A = tl.load(A_ptrs, mask=(offs_m[:, None] < dim) & (offs_n[None, :] < dstate), other=0.0).to(tl.float32) + dA = tl.exp(A * dt[:, None]) + B = tl.load(B_ptrs, mask=offs_n < dstate, other=0.0).to(tl.float32) + C = tl.load(C_ptrs, mask=offs_n < dstate, other=0.0).to(tl.float32) + if HAS_D: + D = tl.load(D_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32) + if HAS_Z: + z = tl.load(z_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32) + + dB = B[None, :] * dt[:, None] + state = state * dA + dB * x[:, None] + tl.store(state_ptrs, state, mask=(offs_m[:, None] < dim) & (offs_n[None, :] < dstate)) + out = tl.sum(state * C[None, :], axis=1) + if HAS_D: + out += x * D + if HAS_Z: + out *= z * tl.sigmoid(z) + tl.store(out_ptrs, out, mask=offs_m < dim) + + +def selective_state_update(state, x, dt, A, B, C, D=None, z=None, dt_bias=None, dt_softplus=False): + """ + Argument: + state: (batch, dim, dstate) + x: (batch, dim) + dt: (batch, dim) + A: (dim, dstate) + B: (batch, dstate) + C: (batch, dstate) + D: (dim,) + z: (batch, dim) + dt_bias: (dim,) + Return: + out: (batch, dim) + """ + batch, dim, dstate = state.shape + assert x.shape == (batch, dim) + assert dt.shape == x.shape + assert A.shape == (dim, dstate) + assert B.shape == (batch, dstate) + assert C.shape == B.shape + if D is not None: + assert D.shape == (dim,) + if z is not None: + assert z.shape == x.shape + if dt_bias is not None: + assert dt_bias.shape == (dim,) + out = torch.empty_like(x) + grid = lambda META: (triton.cdiv(dim, META['BLOCK_SIZE_M']), batch) + z_strides = ((z.stride(0), z.stride(1)) if z is not None else (0, 0)) + # We don't want autotune since it will overwrite the state + # We instead tune by hand. 
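+    # Hand-tuned heuristic: larger state sizes get a smaller BLOCK_SIZE_M (fewer channels per
+    # program) and, beyond dstate=128, more warps, presumably to keep register pressure per
+    # program in check.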
+ BLOCK_SIZE_M, num_warps = ((32, 4) if dstate <= 16 + else ((16, 4) if dstate <= 32 else + ((8, 4) if dstate <= 64 else + ((4, 4) if dstate <= 128 else + ((4, 8)))))) + with torch.cuda.device(x.device.index): + _selective_scan_update_kernel[grid]( + state, x, dt, dt_bias, A, B, C, D, z, out, + batch, dim, dstate, + state.stride(0), state.stride(1), state.stride(2), + x.stride(0), x.stride(1), + dt.stride(0), dt.stride(1), + dt_bias.stride(0) if dt_bias is not None else 0, + A.stride(0), A.stride(1), + B.stride(0), B.stride(1), + C.stride(0), C.stride(1), + D.stride(0) if D is not None else 0, + z_strides[0], z_strides[1], + out.stride(0), out.stride(1), + dt_softplus, + BLOCK_SIZE_M, + num_warps=num_warps, + ) + return out + + +def selective_state_update_ref(state, x, dt, A, B, C, D=None, z=None, dt_bias=None, dt_softplus=False): + """ + Argument: + state: (batch, dim, dstate) + x: (batch, dim) + dt: (batch, dim) + A: (dim, dstate) + B: (batch, dstate) + C: (batch, dstate) + D: (dim,) + z: (batch, dim) + dt_bias: (dim,) + Return: + out: (batch, dim) + """ + batch, dim, dstate = state.shape + assert x.shape == (batch, dim) + assert dt.shape == x.shape + assert A.shape == (dim, dstate) + assert B.shape == (batch, dstate) + assert C.shape == B.shape + if D is not None: + assert D.shape == (dim,) + if z is not None: + assert z.shape == x.shape + if dt_bias is not None: + assert dt_bias.shape == (dim,) + dt = dt + dt_bias + dt = F.softplus(dt) if dt_softplus else dt + dA = torch.exp(rearrange(dt, "b d -> b d 1") * A) # (batch, dim, dstate) + dB = rearrange(dt, "b d -> b d 1") * rearrange(B, "b n -> b 1 n") # (batch, dim, dstate) + state.copy_(state * dA + dB * rearrange(x, "b d -> b d 1")) # (batch, dim, dstate + out = torch.einsum("bdn,bn->bd", state.to(C.dtype), C) + if D is not None: + out += (x * D).to(out.dtype) + return (out if z is None else out * F.silu(z)).to(x.dtype) diff --git a/mamba/mamba_ssm/utils/__init__.py b/mamba/mamba_ssm/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/mamba/mamba_ssm/utils/generation.py b/mamba/mamba_ssm/utils/generation.py new file mode 100644 index 0000000000000000000000000000000000000000..9d766b29ac28a388a7d77b22aa2cb1eda733c0f4 --- /dev/null +++ b/mamba/mamba_ssm/utils/generation.py @@ -0,0 +1,377 @@ +# Copyright (c) 2023, Albert Gu, Tri Dao. 
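+#
+# Decoding utilities: greedy / top-k / top-p sampling on top of a model that carries its
+# recurrent state in `inference_params`, plus an optional CUDA-graph path (cg=True) that
+# captures the single-token decode step once per (batch_size, decoding_seqlen) and replays
+# it to avoid per-step kernel-launch overhead.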
+import gc +import time +from collections import namedtuple +from dataclasses import dataclass, field +from functools import partial +from typing import Callable, Optional, Sequence, Union + +import torch +import torch.nn.functional as F +from einops import rearrange, repeat +from torch import Tensor +from torch.profiler import ProfilerActivity, profile, record_function +from transformers.generation import GreedySearchDecoderOnlyOutput, SampleDecoderOnlyOutput + + +@dataclass +class InferenceParams: + """Inference parameters that are passed to the main model in order + to efficienly calculate and store the context during inference.""" + + max_seqlen: int + max_batch_size: int + seqlen_offset: int = 0 + batch_size_offset: int = 0 + key_value_memory_dict: dict = field(default_factory=dict) + lengths_per_sample: Optional[Tensor] = None + + def reset(self, max_seqlen, max_batch_size): + self.max_seqlen = max_seqlen + self.max_batch_size = max_batch_size + self.seqlen_offset = 0 + if self.lengths_per_sample is not None: + self.lengths_per_sample.zero_() + + +# https://github.com/NVIDIA/Megatron-LM/blob/0bb597b42c53355a567aba2a1357cc34b9d99ddd/megatron/text_generation/sampling.py +# https://github.com/huggingface/transformers/blob/a44985b41cfa2de48a5e1de7f1f93b7483da25d1/src/transformers/generation/logits_process.py#L231 +def modify_logits_for_top_k_filtering(logits, top_k): + """Set the logits for none top-k values to -inf. Done in-place.""" + indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None] + logits.masked_fill_(indices_to_remove, float("-Inf")) + + +# https://github.com/NVIDIA/Megatron-LM/blob/0bb597b42c53355a567aba2a1357cc34b9d99ddd/megatron/text_generation/sampling.py +# https://github.com/huggingface/transformers/blob/a44985b41cfa2de48a5e1de7f1f93b7483da25d1/src/transformers/generation/logits_process.py#L170 +def modify_logits_for_top_p_filtering(logits, top_p): + """Set the logits for none top-p values to -inf. Done in-place.""" + if top_p <= 0.0 or top_p >= 1.0: + return + # First sort and calculate cumulative sum of probabilities. + sorted_logits, sorted_indices = torch.sort(logits, descending=False) + cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1) + # Remove tokens with cumulative top_p above the threshold (token with 0 are kept) + sorted_indices_to_remove = cumulative_probs <= (1 - top_p) + # scatter sorted tensors to original indexing + indices_to_remove = sorted_indices_to_remove.scatter( + 1, sorted_indices, sorted_indices_to_remove + ) + logits.masked_fill_(indices_to_remove, float("-inf")) + + +def sample(logits, top_k=1, top_p=0.0, temperature=1.0): + """Sample from top-k logits. + Arguments: + logits: Tensor of shape (batch_size, vocab_size) + """ + if top_k == 1: # Short-circuit for greedy decoding + return logits.argmax(dim=-1) + else: + if top_p > 0.0: + assert top_p <= 1.0, "top-p should be in (0, 1]." 
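+        # When both are set, top-k filtering is applied first and top-p then operates on the
+        # surviving top-k logits (after temperature scaling), as described in decode() below.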
+ if top_k > 0: + top_k = min(top_k, logits.size(-1)) # Safety check + logits_top, indices = torch.topk(logits, top_k, dim=-1) + if temperature != 1.0: + logits_top /= temperature + modify_logits_for_top_p_filtering(logits_top, top_p) + return indices[ + torch.arange(indices.shape[0], device=indices.device), + torch.multinomial(torch.softmax(logits_top, dim=-1), num_samples=1).squeeze(dim=-1), + ] + else: + # Clone so that when we modify for top_p we don't change the original logits + logits_top = logits / temperature if temperature != 1.0 else logits.clone() + modify_logits_for_top_p_filtering(logits_top, top_p) + return torch.multinomial(torch.softmax(logits_top, dim=-1), num_samples=1).squeeze( + dim=-1 + ) + + +@torch.inference_mode() +def decode( + input_ids, + model, + max_length, + top_k=1, + top_p=0.0, + temperature=1.0, + eos_token_id=None, + teacher_outputs=None, + vocab_size=None, + tensor_parallel=1, + cg=False, + enable_timing=False, +): + """Decoding, either greedy or with top-k or top-p sampling. + If top-k = 0, don't limit the number of candidates (pure sampling). + Top-k and top-p can be used together. If top_k > 0 and top_p > 0, then top-k is applied first, + then top-p. + We assume that all sequences in the same batch have the same length. + + Arguments: + input_ids: (batch, seq_len) + max_length: int + teacher_outputs (optional): (batch, seq_len). If provided, instead of sampling from the + logits, the next token is taken from the teacher_outputs. Useful for testing. + Returns: GreedySearchDecoderOnlyOutput or SampleDecoderOnlyOutput, with the following fields: + sequences: (batch, max_length) + scores: tuples of (batch, vocab_size) + """ + batch_size, seqlen_og = input_ids.shape + teacher_output_len = teacher_outputs.shape[1] if teacher_outputs is not None else 0 + if cg: + if not hasattr(model, "_decoding_cache"): + model._decoding_cache = None + model._decoding_cache = update_graph_cache( + model, + model._decoding_cache, + batch_size, + seqlen_og, + max_length, + tensor_parallel=tensor_parallel, + ) + inference_params = model._decoding_cache.inference_params + inference_params.reset(max_length, batch_size) + else: + inference_params = InferenceParams(max_seqlen=max_length, max_batch_size=batch_size) + + def get_logits(input_ids, inference_params): + decoding = inference_params.seqlen_offset > 0 + if decoding: + position_ids = torch.full( + (batch_size, 1), + inference_params.seqlen_offset, + dtype=torch.long, + device=input_ids.device, + ) + else: + position_ids = None + if not cg or not decoding: + logits = model( + input_ids, + position_ids=position_ids, + inference_params=inference_params, + num_last_tokens=1, + ).logits.squeeze(dim=1) + else: + logits = model._decoding_cache.run( + input_ids, position_ids, inference_params.seqlen_offset + ).squeeze(dim=1) + return logits[..., :vocab_size] if vocab_size is not None else logits + + def sample_tokens(logits, inference_params): + if teacher_outputs is None or teacher_output_len <= inference_params.seqlen_offset: + token = sample(logits, top_k=top_k, top_p=top_p, temperature=temperature) + else: + token = teacher_outputs[:, inference_params.seqlen_offset] + # return rearrange(token, "b -> b 1") + return token.unsqueeze(1) + + def should_stop(current_token, inference_params): + if inference_params.seqlen_offset == 0: + return False + if eos_token_id is not None and (current_token == eos_token_id).all(): + return True + if inference_params.seqlen_offset >= max_length - 1: + return True + return False + + start = 
torch.cuda.Event(enable_timing=enable_timing) + end = torch.cuda.Event(enable_timing=enable_timing) + + if enable_timing: + if tensor_parallel > 1: + torch.distributed.barrier() + start.record() + scores, sequences = [], [input_ids] + while not should_stop(sequences[-1], inference_params): + scores.append(get_logits(sequences[-1], inference_params)) + inference_params.seqlen_offset += sequences[-1].shape[1] + sequences.append(sample_tokens(scores[-1], inference_params)) + if enable_timing: + end.record() + if tensor_parallel > 1: + torch.distributed.barrier() + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(start.elapsed_time(end)):.0f}ms") + output_cls = GreedySearchDecoderOnlyOutput if top_k == 1 else SampleDecoderOnlyOutput + return output_cls(sequences=torch.cat(sequences, dim=1), scores=tuple(scores)) + + +class GenerationMixin: + def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs): + raise NotImplementedError + + def generate( + self, + input_ids, + max_length, + top_k=1, + top_p=0.0, + temperature=1.0, + return_dict_in_generate=False, + output_scores=False, + **kwargs, + ): + output = decode( + input_ids, self, max_length, top_k=top_k, top_p=top_p, temperature=temperature, **kwargs + ) + if not output_scores: + output.scores = None + return output if return_dict_in_generate else output.sequences + + +def allocate_inference_cache( + max_batch_size, + max_seqlen, + nheads, + headdim, + layers: Union[int, Sequence], + device, + dtype=torch.float16, +): + assert dtype in [torch.float16, torch.bfloat16, torch.float32] + kv_cache_shape = (max_batch_size, max_seqlen, 2, nheads, headdim) + if isinstance(layers, int): + layers = range(layers) + return {i: torch.empty(kv_cache_shape, device=device, dtype=dtype) for i in layers} + + +@dataclass +class DecodingCGCache: + max_batch_size: int = 0 + max_seqlen: int = 0 + device = None + dtype = None + callables: dict = field(default_factory=dict) + mempool = None + inference_params: Optional[InferenceParams] = None + run: Optional[Callable] = None + + +@torch.inference_mode() +def update_graph_cache( + model, + cache, + batch_size, + seqlen_og, + max_seqlen, + decoding_seqlens=(1,), + tensor_parallel=1, + dtype=None, + n_warmups=2, +): + if cache is None: + cache = DecodingCGCache() + param_example = next(iter(model.parameters())) + device = param_example.device + if dtype is None: + dtype = param_example.dtype + if ( + (device, dtype) != (cache.device, cache.dtype) + or batch_size > cache.max_batch_size + or max_seqlen > cache.max_seqlen + ): # Invalidate the cache + cache.callables = {} + cache.mempool = None + cache.inference_params = None + gc.collect() + cache.device, cache.dtype = device, dtype + cache.max_batch_size, cache.max_seqlen = batch_size, max_seqlen + if hasattr(model, "allocate_inference_cache"): + inf_cache = model.allocate_inference_cache(batch_size, max_seqlen, dtype) + else: + headdim = getattr( + model.config, + "head_dim", + model.config.hidden_size // model.config.num_attention_heads, + ) + inf_cache = allocate_inference_cache( + batch_size, + max_seqlen, + model.config.num_attention_heads // tensor_parallel, + headdim, + model.config.num_hidden_layers, + device, + dtype, + ) + lengths_per_sample = torch.full((batch_size,), seqlen_og, dtype=torch.int32, device=device) + cache.inference_params = InferenceParams( + max_seqlen=max_seqlen, + max_batch_size=batch_size, + seqlen_offset=seqlen_og, + key_value_memory_dict=inf_cache, + lengths_per_sample=lengths_per_sample, + ) + 
cache.mempool = torch.cuda.graphs.graph_pool_handle() + for decoding_seqlen in decoding_seqlens: + if (batch_size, decoding_seqlen) not in cache.callables: + cache.callables[batch_size, decoding_seqlen] = capture_graph( + model, + cache.inference_params, + batch_size, + max_seqlen, + decoding_seqlen=decoding_seqlen, + mempool=cache.mempool, + n_warmups=n_warmups, + ) + + def dispatch(input_ids, position_ids, seqlen): + batch_size, decoding_seqlen = input_ids.shape[:2] + return cache.callables[batch_size, decoding_seqlen](input_ids, position_ids, seqlen) + + cache.run = dispatch + cache.inference_params.seqlen_offset = 0 # Reset so it's not confusing + return cache + + +def capture_graph( + model, inference_params, batch_size, max_seqlen, decoding_seqlen=1, mempool=None, n_warmups=2 +): + device = next(iter(model.parameters())).device + input_ids = torch.full((batch_size, decoding_seqlen), 0, dtype=torch.long, device=device) + position_ids = torch.full((batch_size, decoding_seqlen), 0, dtype=torch.long, device=device) + seqlen_offset_og = inference_params.seqlen_offset + inference_params.seqlen_offset = max_seqlen - decoding_seqlen + inference_params.lengths_per_sample[:] = inference_params.seqlen_offset + + # Warmup before capture + s = torch.cuda.Stream() + s.wait_stream(torch.cuda.current_stream()) + with torch.cuda.stream(s): + for _ in range(n_warmups): + logits = model( + input_ids, + position_ids=position_ids, + inference_params=inference_params, + num_last_tokens=decoding_seqlen, + ).logits + s.synchronize() + # This might be needed for correctness if we run with NCCL_GRAPH_MIXING_SUPPORT=0, + # which requires that graph launch and non-captured launch to not overlap (I think, + # that's how I interpret the documentation). I'm not sure if this is required. 
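+    # Once captured, input_ids / position_ids / logits become static buffers: run() copies
+    # fresh inputs into them, replays the graph, and returns a clone of the logits buffer.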
+ if torch.distributed.is_initialized(): + torch.distributed.barrier() + torch.cuda.current_stream().wait_stream(s) + # Captures the graph + # To allow capture, automatically sets a side stream as the current stream in the context + graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(graph, pool=mempool): + logits = model( + input_ids, + position_ids=position_ids, + inference_params=inference_params, + num_last_tokens=decoding_seqlen, + ).logits + + def run(new_input_ids, new_position_ids, seqlen): + inference_params.lengths_per_sample[:] = seqlen + input_ids.copy_(new_input_ids) + position_ids.copy_(new_position_ids) + graph.replay() + return logits.clone() + + inference_params.seqlen_offset = seqlen_offset_og + return run diff --git a/mamba/mamba_ssm/utils/hf.py b/mamba/mamba_ssm/utils/hf.py new file mode 100644 index 0000000000000000000000000000000000000000..0d7555acddbd260636d1d14d5bd6324f6af0056a --- /dev/null +++ b/mamba/mamba_ssm/utils/hf.py @@ -0,0 +1,23 @@ +import json + +import torch + +from transformers.utils import WEIGHTS_NAME, CONFIG_NAME +from transformers.utils.hub import cached_file + + +def load_config_hf(model_name): + resolved_archive_file = cached_file(model_name, CONFIG_NAME, _raise_exceptions_for_missing_entries=False) + return json.load(open(resolved_archive_file)) + + +def load_state_dict_hf(model_name, device=None, dtype=None): + # If not fp32, then we don't want to load directly to the GPU + mapped_device = "cpu" if dtype not in [torch.float32, None] else device + resolved_archive_file = cached_file(model_name, WEIGHTS_NAME, _raise_exceptions_for_missing_entries=False) + return torch.load(resolved_archive_file, map_location=mapped_device) + # Convert dtype before moving to GPU to save memory + if dtype is not None: + state_dict = {k: v.to(dtype=dtype) for k, v in state_dict.items()} + state_dict = {k: v.to(device=device) for k, v in state_dict.items()} + return state_dict diff --git a/mamba/setup.py b/mamba/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..2ce0ac045f8b2ae07f39f3d045e997ab362ec4c1 --- /dev/null +++ b/mamba/setup.py @@ -0,0 +1,276 @@ +# Copyright (c) 2023, Albert Gu, Tri Dao. 
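+#
+# Build logic: unless MAMBA_FORCE_BUILD=TRUE, installation first tries to download a prebuilt
+# mamba_ssm wheel matching the local CUDA / torch / Python / platform combination from the
+# GitHub releases, and only falls back to compiling the selective_scan_cuda extension itself.
+# MAMBA_SKIP_CUDA_BUILD=TRUE skips the CUDA extension entirely (e.g. for sdist builds) and
+# MAMBA_FORCE_CXX11_ABI=TRUE forces the C++11 ABI to match containers built with it.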
+import sys +import warnings +import os +import re +import ast +from pathlib import Path +from packaging.version import parse, Version +import platform +import shutil + +from setuptools import setup, find_packages +import subprocess + +import urllib.request +import urllib.error +from wheel.bdist_wheel import bdist_wheel as _bdist_wheel + +import torch +from torch.utils.cpp_extension import ( + BuildExtension, + CppExtension, + CUDAExtension, + CUDA_HOME, +) + + +with open("README.md", "r", encoding="utf-8") as fh: + long_description = fh.read() + + +# ninja build does not work unless include_dirs are abs path +this_dir = os.path.dirname(os.path.abspath(__file__)) + +PACKAGE_NAME = "mamba_ssm" + +BASE_WHEEL_URL = "https://github.com/state-spaces/mamba/releases/download/{tag_name}/{wheel_name}" + +# FORCE_BUILD: Force a fresh build locally, instead of attempting to find prebuilt wheels +# SKIP_CUDA_BUILD: Intended to allow CI to use a simple `python setup.py sdist` run to copy over raw files, without any cuda compilation +FORCE_BUILD = os.getenv("MAMBA_FORCE_BUILD", "FALSE") == "TRUE" +SKIP_CUDA_BUILD = os.getenv("MAMBA_SKIP_CUDA_BUILD", "FALSE") == "TRUE" +# For CI, we want the option to build with C++11 ABI since the nvcr images use C++11 ABI +FORCE_CXX11_ABI = os.getenv("MAMBA_FORCE_CXX11_ABI", "FALSE") == "TRUE" + + +def get_platform(): + """ + Returns the platform name as used in wheel filenames. + """ + if sys.platform.startswith("linux"): + return "linux_x86_64" + elif sys.platform == "darwin": + mac_version = ".".join(platform.mac_ver()[0].split(".")[:2]) + return f"macosx_{mac_version}_x86_64" + elif sys.platform == "win32": + return "win_amd64" + else: + raise ValueError("Unsupported platform: {}".format(sys.platform)) + + +def get_cuda_bare_metal_version(cuda_dir): + raw_output = subprocess.check_output( + [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True + ) + output = raw_output.split() + release_idx = output.index("release") + 1 + bare_metal_version = parse(output[release_idx].split(",")[0]) + + return raw_output, bare_metal_version + + +def check_if_cuda_home_none(global_option: str) -> None: + if CUDA_HOME is not None: + return + # warn instead of error because user could be downloading prebuilt wheels, so nvcc won't be necessary + # in that case. + warnings.warn( + f"{global_option} was requested, but nvcc was not found. Are you sure your environment has nvcc available? " + "If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, " + "only images whose names contain 'devel' will provide nvcc." + ) + + +def append_nvcc_threads(nvcc_extra_args): + return nvcc_extra_args + ["--threads", "4"] + + +cmdclass = {} +ext_modules = [] + +if not SKIP_CUDA_BUILD: + print("\n\ntorch.__version__ = {}\n\n".format(torch.__version__)) + TORCH_MAJOR = int(torch.__version__.split(".")[0]) + TORCH_MINOR = int(torch.__version__.split(".")[1]) + + check_if_cuda_home_none(PACKAGE_NAME) + # Check, if CUDA11 is installed for compute capability 8.0 + cc_flag = [] + if CUDA_HOME is not None: + _, bare_metal_version = get_cuda_bare_metal_version(CUDA_HOME) + if bare_metal_version < Version("11.6"): + raise RuntimeError( + f"{PACKAGE_NAME} is only supported on CUDA 11.6 and above. " + "Note: make sure nvcc has a supported version by running nvcc -V." 
+ ) + + cc_flag.append("-gencode") + cc_flag.append("arch=compute_70,code=sm_70") + cc_flag.append("-gencode") + cc_flag.append("arch=compute_80,code=sm_80") + if bare_metal_version >= Version("11.8"): + cc_flag.append("-gencode") + cc_flag.append("arch=compute_90,code=sm_90") + + # HACK: The compiler flag -D_GLIBCXX_USE_CXX11_ABI is set to be the same as + # torch._C._GLIBCXX_USE_CXX11_ABI + # https://github.com/pytorch/pytorch/blob/8472c24e3b5b60150096486616d98b7bea01500b/torch/utils/cpp_extension.py#L920 + if FORCE_CXX11_ABI: + torch._C._GLIBCXX_USE_CXX11_ABI = True + + ext_modules.append( + CUDAExtension( + name="selective_scan_cuda", + sources=[ + "csrc/selective_scan/selective_scan.cpp", + "csrc/selective_scan/selective_scan_fwd_fp32.cu", + "csrc/selective_scan/selective_scan_fwd_fp16.cu", + "csrc/selective_scan/selective_scan_fwd_bf16.cu", + "csrc/selective_scan/selective_scan_bwd_fp32_real.cu", + "csrc/selective_scan/selective_scan_bwd_fp32_complex.cu", + "csrc/selective_scan/selective_scan_bwd_fp16_real.cu", + "csrc/selective_scan/selective_scan_bwd_fp16_complex.cu", + "csrc/selective_scan/selective_scan_bwd_bf16_real.cu", + "csrc/selective_scan/selective_scan_bwd_bf16_complex.cu", + ], + extra_compile_args={ + "cxx": ["-O3", "-std=c++17"], + "nvcc": append_nvcc_threads( + [ + "-O3", + "-std=c++17", + "-U__CUDA_NO_HALF_OPERATORS__", + "-U__CUDA_NO_HALF_CONVERSIONS__", + "-U__CUDA_NO_BFLOAT16_OPERATORS__", + "-U__CUDA_NO_BFLOAT16_CONVERSIONS__", + "-U__CUDA_NO_BFLOAT162_OPERATORS__", + "-U__CUDA_NO_BFLOAT162_CONVERSIONS__", + "--expt-relaxed-constexpr", + "--expt-extended-lambda", + "--use_fast_math", + "--ptxas-options=-v", + "-lineinfo", + ] + + cc_flag + ), + }, + include_dirs=[Path(this_dir) / "csrc" / "selective_scan"], + ) + ) + + +def get_package_version(): + with open(Path(this_dir) / PACKAGE_NAME / "__init__.py", "r") as f: + version_match = re.search(r"^__version__\s*=\s*(.*)$", f.read(), re.MULTILINE) + public_version = ast.literal_eval(version_match.group(1)) + local_version = os.environ.get("MAMBA_LOCAL_VERSION") + if local_version: + return f"{public_version}+{local_version}" + else: + return str(public_version) + + +def get_wheel_url(): + # Determine the version numbers that will be used to determine the correct wheel + # We're using the CUDA version used to build torch, not the one currently installed + # _, cuda_version_raw = get_cuda_bare_metal_version(CUDA_HOME) + torch_cuda_version = parse(torch.version.cuda) + torch_version_raw = parse(torch.__version__) + # For CUDA 11, we only compile for CUDA 11.8, and for CUDA 12 we only compile for CUDA 12.2 + # to save CI time. Minor versions should be compatible. 
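+    # For illustration only (hypothetical version numbers), the filename assembled below
+    # looks like: mamba_ssm-1.1.1+cu118torch2.1cxx11abiFALSE-cp310-cp310-linux_x86_64.whl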
+ torch_cuda_version = parse("11.8") if torch_cuda_version.major == 11 else parse("12.2") + python_version = f"cp{sys.version_info.major}{sys.version_info.minor}" + platform_name = get_platform() + mamba_ssm_version = get_package_version() + # cuda_version = f"{cuda_version_raw.major}{cuda_version_raw.minor}" + cuda_version = f"{torch_cuda_version.major}{torch_cuda_version.minor}" + torch_version = f"{torch_version_raw.major}.{torch_version_raw.minor}" + cxx11_abi = str(torch._C._GLIBCXX_USE_CXX11_ABI).upper() + + # Determine wheel URL based on CUDA version, torch version, python version and OS + wheel_filename = f"{PACKAGE_NAME}-{mamba_ssm_version}+cu{cuda_version}torch{torch_version}cxx11abi{cxx11_abi}-{python_version}-{python_version}-{platform_name}.whl" + wheel_url = BASE_WHEEL_URL.format( + tag_name=f"v{mamba_ssm_version}", wheel_name=wheel_filename + ) + return wheel_url, wheel_filename + + +class CachedWheelsCommand(_bdist_wheel): + """ + The CachedWheelsCommand plugs into the default bdist wheel, which is ran by pip when it cannot + find an existing wheel (which is currently the case for all installs). We use + the environment parameters to detect whether there is already a pre-built version of a compatible + wheel available and short-circuits the standard full build pipeline. + """ + + def run(self): + if FORCE_BUILD: + return super().run() + + wheel_url, wheel_filename = get_wheel_url() + print("Guessing wheel URL: ", wheel_url) + try: + urllib.request.urlretrieve(wheel_url, wheel_filename) + + # Make the archive + # Lifted from the root wheel processing command + # https://github.com/pypa/wheel/blob/cf71108ff9f6ffc36978069acb28824b44ae028e/src/wheel/bdist_wheel.py#LL381C9-L381C85 + if not os.path.exists(self.dist_dir): + os.makedirs(self.dist_dir) + + impl_tag, abi_tag, plat_tag = self.get_tag() + archive_basename = f"{self.wheel_dist_name}-{impl_tag}-{abi_tag}-{plat_tag}" + + wheel_path = os.path.join(self.dist_dir, archive_basename + ".whl") + print("Raw wheel path", wheel_path) + shutil.move(wheel_filename, wheel_path) + except urllib.error.HTTPError: + print("Precompiled wheel not found. 
Building from source...") + # If the wheel could not be downloaded, build from source + super().run() + + +setup( + name=PACKAGE_NAME, + version=get_package_version(), + packages=find_packages( + exclude=( + "build", + "csrc", + "include", + "tests", + "dist", + "docs", + "benchmarks", + "mamba_ssm.egg-info", + ) + ), + author="Tri Dao, Albert Gu", + author_email="tri@tridao.me, agu@cs.cmu.edu", + description="Mamba state-space model", + long_description=long_description, + long_description_content_type="text/markdown", + url="https://github.com/state-spaces/mamba", + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: BSD License", + "Operating System :: Unix", + ], + ext_modules=ext_modules, + cmdclass={"bdist_wheel": CachedWheelsCommand, "build_ext": BuildExtension} + if ext_modules + else { + "bdist_wheel": CachedWheelsCommand, + }, + python_requires=">=3.7", + install_requires=[ + "torch", + "packaging", + "ninja", + "einops", + "triton", + "transformers", + "causal_conv1d", + ], +) diff --git a/mamba/test_mamba_module.py b/mamba/test_mamba_module.py new file mode 100644 index 0000000000000000000000000000000000000000..64710e92f7ec4fc0fe88821550e4ecf902a22bfe --- /dev/null +++ b/mamba/test_mamba_module.py @@ -0,0 +1,15 @@ +import torch +from mamba_ssm import Mamba + +batch, length, dim = 2, 64, 768 +x = torch.randn(batch, length, dim).to("cuda") +model = Mamba( + # This module uses roughly 3 * expand * d_model^2 parameters + d_model=dim, # Model dimension d_model + d_state=16, # SSM state expansion factor # 64 + d_conv=4, # Local convolution width + expand=2, # Block expansion factor + use_fast_path=False, +).to("cuda") +y = model(x) +assert y.shape == x.shape diff --git a/mamba/tests/ops/test_selective_scan.py b/mamba/tests/ops/test_selective_scan.py new file mode 100644 index 0000000000000000000000000000000000000000..26b34a37560f08ced653a1d9320a14f3d3f9ebd3 --- /dev/null +++ b/mamba/tests/ops/test_selective_scan.py @@ -0,0 +1,423 @@ +# Copyright (C) 2023, Tri Dao. 
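
The setup.py for the bundled mamba package (shown earlier in this diff) first tries to fetch a matching prebuilt wheel from the GitHub release page and only compiles the selective-scan CUDA extension itself when that fails or when `MAMBA_FORCE_BUILD=TRUE` is set. As a rough, non-authoritative sketch of the wheel-naming scheme used by `get_wheel_url` (every concrete version number, the ABI flag, and the platform string below are illustrative placeholders, not values taken from this repo):

```python
# Minimal sketch of the wheel-naming scheme in get_wheel_url above.
# Every concrete value here (versions, ABI flag, platform) is a placeholder.
import sys

BASE_WHEEL_URL = "https://github.com/state-spaces/mamba/releases/download/{tag_name}/{wheel_name}"

def example_wheel_url(
    package_version="1.0.1",        # hypothetical mamba_ssm __version__
    torch_cuda="11.8",              # CUDA 11.x builds map to 11.8, CUDA 12.x builds to 12.2
    torch_version="2.1",            # torch major.minor
    cxx11_abi="FALSE",              # str(torch._C._GLIBCXX_USE_CXX11_ABI).upper()
    platform_name="linux_x86_64",   # see get_platform()
):
    python_version = f"cp{sys.version_info.major}{sys.version_info.minor}"
    cuda_version = torch_cuda.replace(".", "")
    wheel_filename = (
        f"mamba_ssm-{package_version}+cu{cuda_version}torch{torch_version}"
        f"cxx11abi{cxx11_abi}-{python_version}-{python_version}-{platform_name}.whl"
    )
    return BASE_WHEEL_URL.format(tag_name=f"v{package_version}", wheel_name=wheel_filename)

print(example_wheel_url())
```

If the download in `CachedWheelsCommand.run` hits an HTTP error (no wheel published for that combination), the command falls back to `super().run()` and builds the extension from source; setting `MAMBA_FORCE_BUILD=TRUE` skips the download attempt entirely.
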
+ +import math + +import torch +import torch.nn.functional as F +from torch.autograd import gradcheck +import pytest + +from einops import rearrange + +from mamba_ssm.ops.selective_scan_interface import selective_scan_fn, selective_scan_ref +from mamba_ssm.ops.selective_scan_interface import mamba_inner_fn, mamba_inner_ref +from mamba_ssm.ops.selective_scan_interface import bimamba_inner_fn, bimamba_inner_ref + + +# @pytest.mark.parametrize('wtype', [torch.float32, torch.complex64]) +@pytest.mark.parametrize('wtype', [torch.float32]) +# @pytest.mark.parametrize('itype', [torch.float32, torch.float16, torch.bfloat16]) +@pytest.mark.parametrize('itype', [torch.float32]) +# @pytest.mark.parametrize('seqlen', [8, 16, 32, 64, 128, 256, 372, 512, 784, 1024, 1134, 2048, 4096]) +@pytest.mark.parametrize('seqlen', [128, 256, 512, 1024, 2048, 4096]) +# @pytest.mark.parametrize('seqlen', [128]) +# @pytest.mark.parametrize("return_last_state", [False, True]) +@pytest.mark.parametrize("return_last_state", [True]) +# @pytest.mark.parametrize('has_delta_bias', [False, True]) +@pytest.mark.parametrize('has_delta_bias', [True]) +# @pytest.mark.parametrize('delta_softplus', [False, True]) +@pytest.mark.parametrize('delta_softplus', [True]) +# @pytest.mark.parametrize('has_z', [False, True]) +@pytest.mark.parametrize('has_z', [True]) +# @pytest.mark.parametrize('has_D', [False, True]) +@pytest.mark.parametrize('has_D', [True]) +@pytest.mark.parametrize("varBC_groups", [1, 2]) +# @pytest.mark.parametrize("varBC_groups", [1]) +# @pytest.mark.parametrize("is_variable_C", [False, True]) +@pytest.mark.parametrize("is_variable_C", [True]) +# @pytest.mark.parametrize("is_variable_B", [False, True]) +@pytest.mark.parametrize("is_variable_B", [True]) +def test_selective_scan(is_variable_B, is_variable_C, varBC_groups, has_D, has_z, has_delta_bias, + delta_softplus, return_last_state, seqlen, itype, wtype): + if varBC_groups > 1 and (not is_variable_B or not is_variable_C): + pytest.skip() # This config is not applicable + device = 'cuda' + rtol, atol = (6e-4, 2e-3) if itype == torch.float32 else (3e-3, 5e-3) + if itype == torch.bfloat16: + rtol, atol = 3e-2, 5e-2 + rtolw, atolw = (1e-3, 1e-3) + if has_z: # If we have z, the errors on the weights seem higher + rtolw = max(rtolw, rtol) + atolw = max(atolw, atol) + # set seed + torch.random.manual_seed(0) + batch_size = 2 + dim = 4 + dstate = 8 + is_complex = wtype == torch.complex64 + A = (-0.5 * torch.rand(dim, dstate, device=device, dtype=wtype)).requires_grad_() + if not is_variable_B: + B_shape = (dim, dstate) + elif varBC_groups == 1: + B_shape = (batch_size, dstate, seqlen if not is_complex else seqlen * 2) + else: + B_shape = (batch_size, varBC_groups, dstate, seqlen if not is_complex else seqlen * 2) + B = torch.randn(*B_shape, device=device, dtype=wtype if not is_variable_B else itype, + requires_grad=True) + if not is_variable_C: + C_shape = (dim, dstate) + elif varBC_groups == 1: + C_shape = (batch_size, dstate, seqlen if not is_complex else seqlen * 2) + else: + C_shape = (batch_size, varBC_groups, dstate, seqlen if not is_complex else seqlen * 2) + C = torch.randn(*C_shape, device=device, dtype=wtype if not is_variable_C else itype, + requires_grad=True) + if has_D: + D = torch.randn(dim, device=device, dtype=torch.float32, requires_grad=True) + else: + D = None + if has_z: + z = torch.randn(batch_size, dim, seqlen, device=device, dtype=itype, requires_grad=True) + else: + z = None + if has_delta_bias: + delta_bias = (0.5 * torch.rand(dim, device=device, 
dtype=torch.float32)).requires_grad_() + else: + delta_bias = None + u = torch.randn(batch_size, dim, seqlen, device=device, dtype=itype, requires_grad=True) + delta = (0.5 * torch.rand(batch_size, dim, seqlen, device=device, dtype=itype)).requires_grad_() + A_ref = A.detach().clone().requires_grad_() + B_ref = B.detach().clone().requires_grad_() + C_ref = C.detach().clone().requires_grad_() + D_ref = D.detach().clone().requires_grad_() if D is not None else None + z_ref = z.detach().clone().requires_grad_() if z is not None else None + u_ref = u.detach().clone().requires_grad_() + delta_ref = delta.detach().clone().requires_grad_() + delta_bias_ref = delta_bias.detach().clone().requires_grad_() if delta_bias is not None else None + out, *rest = selective_scan_fn( + u, delta, A, B, C, D, z=z, + delta_bias=delta_bias, delta_softplus=delta_softplus, + return_last_state=return_last_state + ) + if return_last_state: + state = rest[0] + out_ref, *rest = selective_scan_ref( + u_ref, delta_ref, A_ref, B_ref, C_ref, D_ref, z=z_ref, + delta_bias=delta_bias_ref, delta_softplus=delta_softplus, + return_last_state=return_last_state + ) + if return_last_state: + state_ref = rest[0] + # dA = torch.exp(torch.einsum('bdl,dn->bdln', delta, A)) + # dt_u = delta * u + + print(f'Output max diff: {(out - out_ref).abs().max().item()}') + print(f'Output mean diff: {(out - out_ref).abs().mean().item()}') + assert torch.allclose(out, out_ref, rtol=rtol, atol=atol) + if return_last_state: + print(f'State max diff: {(state - state_ref).abs().max().item()}') + assert torch.allclose(state, state_ref, rtol=rtol, atol=atol) + + g = torch.randn_like(out) + out_ref.backward(g) + out.backward(g) + + print(f'du max diff: {(u.grad - u_ref.grad).abs().max().item()}') + print(f'ddelta max diff: {(delta.grad - delta_ref.grad).abs().max().item()}') + print(f'dA max diff: {(A.grad - A_ref.grad).abs().max().item()}') + print(f'dB max diff: {(B.grad - B_ref.grad).abs().max().item()}') + print(f'dC max diff: {(C.grad - C_ref.grad).abs().max().item()}') + if has_D: + print(f'dD max diff: {(D.grad - D_ref.grad).abs().max().item()}') + if has_z: + print(f'dz max diff: {(z.grad - z_ref.grad).abs().max().item()}') + if has_delta_bias: + print(f'ddelta_bias max diff: {(delta_bias.grad - delta_bias_ref.grad).abs().max().item()}') + + assert torch.allclose(u.grad, u_ref.grad.to(dtype=itype), rtol=rtol * 2, atol=atol * 2) + assert torch.allclose(delta.grad, delta_ref.grad.to(dtype=itype), rtol=rtol * 5, atol=atol * 10) + assert torch.allclose(A.grad, A_ref.grad, rtol=rtolw, atol=atolw * 5) + assert torch.allclose(B.grad, B_ref.grad, rtol=rtolw if not is_variable_B else rtol, + atol=atolw if not is_variable_B else atol) + assert torch.allclose(C.grad, C_ref.grad, rtol=rtolw if not is_variable_C else rtol, + atol=atolw if not is_variable_C else atol) + if has_D: + assert torch.allclose(D.grad, D_ref.grad, rtol=rtolw, atol=atolw) + if has_z: + assert torch.allclose(z.grad, z_ref.grad, rtol=rtolw, atol=atolw) + if has_delta_bias: + assert torch.allclose(delta_bias.grad, delta_bias_ref.grad, rtol=rtolw, atol=atolw) + + +@pytest.mark.parametrize('wtype', [torch.float32, torch.complex64]) +# @pytest.mark.parametrize('wtype', [torch.complex64]) +# @pytest.mark.parametrize('itype', [torch.float32, torch.float16, torch.bfloat16]) +@pytest.mark.parametrize('itype', [torch.float32]) +# @pytest.mark.parametrize('seqlen', [8, 16, 32, 64, 128, 256, 372, 512, 784, 1024, 1134, 2048, 4096]) +@pytest.mark.parametrize('seqlen', [128]) 
+@pytest.mark.parametrize("is_variable_C", [False, True]) +# @pytest.mark.parametrize("is_variable_C", [False]) +@pytest.mark.parametrize("is_variable_B", [False, True]) +# @pytest.mark.parametrize("is_variable_B", [True]) +def test_mamba_inner_fn(is_variable_B, is_variable_C, seqlen, itype, wtype): + device = 'cuda' + rtol, atol = (6e-4, 2e-3) if itype == torch.float32 else (3e-3, 5e-3) + if itype == torch.bfloat16: + rtol, atol = 3e-2, 5e-2 + rtolw, atolw = (1e-3, 1e-3) + # If we have z, the errors on the weights seem higher + rtolw = max(rtolw, rtol) + atolw = max(atolw, atol) + # set seed + torch.random.manual_seed(0) + batch_size = 2 + dim = 768 + dstate = 8 + dt_rank = 48 + is_complex = wtype == torch.complex64 + xz = torch.randn(batch_size, 2 * dim, seqlen, device=device, dtype=itype, requires_grad=True) + conv1d_weight = torch.randn(dim, 1, 3, device=device, dtype=torch.float32, requires_grad=True) + conv1d_bias = torch.randn(dim, device=device, dtype=torch.float32, requires_grad=True) + x_proj_weight = torch.randn(dt_rank + (bool(is_variable_B) + bool(is_variable_C)) * dstate + * (1 if not is_complex else 2), + dim, device=device, dtype=itype, requires_grad=True) + delta_proj_weight = torch.randn(dim, dt_rank, device=device, dtype=itype, requires_grad=True) + out_proj_weight = torch.randn(dim // 2, dim, device=device, dtype=itype, requires_grad=True) + out_proj_bias = None + A = (-0.5 * torch.rand(dim, dstate, device=device, dtype=wtype)).requires_grad_() + B = (torch.randn(dim, dstate, device=device, dtype=wtype, requires_grad=True) + if not is_variable_B else None) + C = (torch.randn(dim, dstate, device=device, dtype=wtype, requires_grad=True) + if not is_variable_C else None) + D = torch.randn(dim, device=device, dtype=torch.float32, requires_grad=True) + delta_bias = (0.5 * torch.rand(dim, device=device, dtype=torch.float32)).requires_grad_() + B_proj_bias = None + C_proj_bias = None + xz_ref = xz.detach().clone().requires_grad_() + conv1d_weight_ref = conv1d_weight.detach().clone().requires_grad_() + conv1d_bias_ref = conv1d_bias.detach().clone().requires_grad_() + x_proj_weight_ref = x_proj_weight.detach().clone().requires_grad_() + delta_proj_weight_ref = delta_proj_weight.detach().clone().requires_grad_() + out_proj_weight_ref = out_proj_weight.detach().clone().requires_grad_() + out_proj_bias_ref = (out_proj_bias.detach().clone().requires_grad_() + if out_proj_bias is not None else None) + A_ref = A.detach().clone().requires_grad_() + B_ref = B.detach().clone().requires_grad_() if B is not None else None + C_ref = C.detach().clone().requires_grad_() if C is not None else None + D_ref = D.detach().clone().requires_grad_() + delta_bias_ref = delta_bias.detach().clone().requires_grad_() if delta_bias is not None else None + out = mamba_inner_fn(xz, conv1d_weight, conv1d_bias, x_proj_weight, delta_proj_weight, + out_proj_weight, out_proj_bias, + A, B, C, D, delta_bias=delta_bias, delta_softplus=True) + out_ref = mamba_inner_ref(xz_ref, conv1d_weight_ref, conv1d_bias_ref, x_proj_weight_ref, + delta_proj_weight_ref, out_proj_weight_ref, out_proj_bias_ref, + A_ref, B_ref, C_ref, D_ref, + delta_bias=delta_bias_ref, delta_softplus=True) + # dA = torch.exp(torch.einsum('bdl,dn->bdln', delta, A)) + # dt_u = delta * u + print("mamba_inner_fn") + print(f'Output max diff: {(out - out_ref).abs().max().item()}') + print(f'Output mean diff: {(out - out_ref).abs().mean().item()}') + assert torch.allclose(out, out_ref, rtol=rtol, atol=atol) + + g = torch.randn_like(out) + 
out_ref.backward(g) + out.backward(g) + + print(f'dxz max diff: {(xz.grad - xz_ref.grad).abs().max().item()}') + print(f'dA max diff: {(A.grad - A_ref.grad).abs().max().item()}') + if not is_variable_B: + print(f'dB max diff: {(B.grad - B_ref.grad).abs().max().item()}') + if not is_variable_C: + print(f'dC max diff: {(C.grad - C_ref.grad).abs().max().item()}') + print(f'dD max diff: {(D.grad - D_ref.grad).abs().max().item()}') + print(f'ddelta_bias max diff: {(delta_bias.grad - delta_bias_ref.grad).abs().max().item()}') + print(f'dout_proj_weight max diff: {(out_proj_weight.grad - out_proj_weight_ref.grad).abs().max().item()}') + print(f'ddelta_proj_weight max diff: {(delta_proj_weight.grad - delta_proj_weight_ref.grad).abs().max().item()}') + print(f'dx_proj_weight max diff: {(x_proj_weight.grad - x_proj_weight_ref.grad).abs().max().item()}') + print(f'dconv1d_weight max diff: {(conv1d_weight.grad - conv1d_weight_ref.grad).abs().max().item()}') + print(f'dconv1d_bias max diff: {(conv1d_bias.grad - conv1d_bias_ref.grad).abs().max().item()}') + + # assert torch.allclose(xz.grad, xz_ref.grad.to(dtype=itype), rtol=rtol * 2, atol=atol * 2) + # assert torch.allclose(delta.grad, delta_ref.grad.to(dtype=itype), rtol=rtol * 5, atol=atol * 10) + # assert torch.allclose(A.grad, A_ref.grad, rtol=rtolw, atol=atolw * 5) + # assert torch.allclose(B.grad, B_ref.grad, rtol=rtolw if not is_variable_B else rtol, + # atol=atolw if not is_variable_B else atol) + # assert torch.allclose(C.grad, C_ref.grad, rtol=rtolw if not is_variable_C else rtol, + # atol=atolw if not is_variable_C else atol) + # assert torch.allclose(D.grad, D_ref.grad, rtol=rtolw, atol=atolw) + # assert torch.allclose(delta_bias.grad, delta_bias_ref.grad, rtol=rtolw, atol=atolw) + + +# test_mamba_inner_fn(False, False, 128, torch.float32, torch.float32) + + +@pytest.mark.parametrize('wtype', [torch.float32, torch.complex64]) +# @pytest.mark.parametrize('wtype', [torch.complex64]) +# @pytest.mark.parametrize('itype', [torch.float32, torch.float16, torch.bfloat16]) +@pytest.mark.parametrize('itype', [torch.float32]) +# @pytest.mark.parametrize('seqlen', [8, 16, 32, 64, 128, 256, 372, 512, 784, 1024, 1134, 2048, 4096]) +@pytest.mark.parametrize('seqlen', [128]) +@pytest.mark.parametrize("is_variable_C", [False, True]) +# @pytest.mark.parametrize("is_variable_C", [False]) +@pytest.mark.parametrize("is_variable_B", [False, True]) +# @pytest.mark.parametrize("is_variable_B", [True]) +def test_bimamba_inner_fn(is_variable_B, is_variable_C, seqlen, itype, wtype): + device = 'cuda' + rtol, atol = (6e-4, 2e-3) if itype == torch.float32 else (3e-3, 5e-3) + if itype == torch.bfloat16: + rtol, atol = 3e-2, 5e-2 + rtolw, atolw = (1e-3, 1e-3) + # If we have z, the errors on the weights seem higher + rtolw = max(rtolw, rtol) + atolw = max(atolw, atol) + # set seed + torch.random.manual_seed(0) + batch_size = 2 + dim = 768 + dstate = 8 + dt_rank = 48 + is_complex = wtype == torch.complex64 + xz = torch.randn(batch_size, 2 * dim, seqlen, device=device, dtype=itype, requires_grad=True) + conv1d_weight = torch.randn(dim, 1, 3, device=device, dtype=torch.float32, requires_grad=True) + conv1d_bias = torch.randn(dim, device=device, dtype=torch.float32, requires_grad=True) + x_proj_weight = torch.randn(dt_rank + (bool(is_variable_B) + bool(is_variable_C)) * dstate + * (1 if not is_complex else 2), + dim, device=device, dtype=itype, requires_grad=True) + delta_proj_weight = torch.randn(dim, dt_rank, device=device, dtype=itype, requires_grad=True) + out_proj_weight 
= torch.randn(dim // 2, dim, device=device, dtype=itype, requires_grad=True) + out_proj_bias = None + A = (-0.5 * torch.rand(dim, dstate, device=device, dtype=wtype)).requires_grad_() + A_b = (-0.5 * torch.rand(dim, dstate, device=device, dtype=wtype)).requires_grad_() + B = (torch.randn(dim, dstate, device=device, dtype=wtype, requires_grad=True) + if not is_variable_B else None) + C = (torch.randn(dim, dstate, device=device, dtype=wtype, requires_grad=True) + if not is_variable_C else None) + D = torch.randn(dim, device=device, dtype=torch.float32, requires_grad=True) + delta_bias = (0.5 * torch.rand(dim, device=device, dtype=torch.float32)).requires_grad_() + B_proj_bias = None + C_proj_bias = None + xz_ref = xz.detach().clone().requires_grad_() + conv1d_weight_ref = conv1d_weight.detach().clone().requires_grad_() + conv1d_bias_ref = conv1d_bias.detach().clone().requires_grad_() + x_proj_weight_ref = x_proj_weight.detach().clone().requires_grad_() + delta_proj_weight_ref = delta_proj_weight.detach().clone().requires_grad_() + out_proj_weight_ref = out_proj_weight.detach().clone().requires_grad_() + out_proj_bias_ref = (out_proj_bias.detach().clone().requires_grad_() + if out_proj_bias is not None else None) + A_ref = A.detach().clone().requires_grad_() + A_b_ref = A_b.detach().clone().requires_grad_() + B_ref = B.detach().clone().requires_grad_() if B is not None else None + C_ref = C.detach().clone().requires_grad_() if C is not None else None + D_ref = D.detach().clone().requires_grad_() + delta_bias_ref = delta_bias.detach().clone().requires_grad_() if delta_bias is not None else None + out = bimamba_inner_fn(xz, conv1d_weight, conv1d_bias, x_proj_weight, delta_proj_weight, + out_proj_weight, out_proj_bias, + A, A_b, B, C, D, delta_bias=delta_bias, delta_softplus=True) + out_ref = bimamba_inner_fn(xz_ref, conv1d_weight_ref, conv1d_bias_ref, x_proj_weight_ref, + delta_proj_weight_ref, out_proj_weight_ref, out_proj_bias_ref, + A_ref, A_b_ref, B_ref, C_ref, D_ref, + delta_bias=delta_bias_ref, delta_softplus=True) + # dA = torch.exp(torch.einsum('bdl,dn->bdln', delta, A)) + # dt_u = delta * u + print("bimamba_inner_fn") + print(f'Output max diff: {(out - out_ref).abs().max().item()}') + print(f'Output mean diff: {(out - out_ref).abs().mean().item()}') + assert torch.allclose(out, out_ref, rtol=rtol, atol=atol) + + g = torch.randn_like(out) + out_ref.backward(g) + out.backward(g) + + print(f'dxz max diff: {(xz.grad - xz_ref.grad).abs().max().item()}') + print(f'dA max diff: {(A.grad - A_ref.grad).abs().max().item()}') + print(f'dA_b max diff: {(A_b.grad - A_b_ref.grad).abs().max().item()}') + if not is_variable_B: + print(f'dB max diff: {(B.grad - B_ref.grad).abs().max().item()}') + if not is_variable_C: + print(f'dC max diff: {(C.grad - C_ref.grad).abs().max().item()}') + print(f'dD max diff: {(D.grad - D_ref.grad).abs().max().item()}') + print(f'ddelta_bias max diff: {(delta_bias.grad - delta_bias_ref.grad).abs().max().item()}') + print(f'dout_proj_weight max diff: {(out_proj_weight.grad - out_proj_weight_ref.grad).abs().max().item()}') + print(f'ddelta_proj_weight max diff: {(delta_proj_weight.grad - delta_proj_weight_ref.grad).abs().max().item()}') + print(f'dx_proj_weight max diff: {(x_proj_weight.grad - x_proj_weight_ref.grad).abs().max().item()}') + print(f'dconv1d_weight max diff: {(conv1d_weight.grad - conv1d_weight_ref.grad).abs().max().item()}') + print(f'dconv1d_bias max diff: {(conv1d_bias.grad - conv1d_bias_ref.grad).abs().max().item()}') + +@pytest.mark.parametrize('wtype', 
[torch.float32, torch.complex64]) +# @pytest.mark.parametrize('wtype', [torch.complex64]) +# @pytest.mark.parametrize('itype', [torch.float32, torch.float16, torch.bfloat16]) +@pytest.mark.parametrize('itype', [torch.float32]) +# @pytest.mark.parametrize('seqlen', [8, 16, 32, 64, 128, 256, 372, 512, 784, 1024, 1134, 2048, 4096]) +@pytest.mark.parametrize('seqlen', [128]) +@pytest.mark.parametrize("is_variable_C", [False, True]) +# @pytest.mark.parametrize("is_variable_C", [False]) +@pytest.mark.parametrize("is_variable_B", [False, True]) +# @pytest.mark.parametrize("is_variable_B", [True]) +def test_bimamba_inner_fn_grad_check(is_variable_B, is_variable_C, seqlen, itype, wtype): + device = 'cuda' + rtol, atol = (6e-4, 2e-3) if itype == torch.float32 else (3e-3, 5e-3) + if itype == torch.bfloat16: + rtol, atol = 3e-2, 5e-2 + rtolw, atolw = (1e-3, 1e-3) + # If we have z, the errors on the weights seem higher + rtolw = max(rtolw, rtol) + atolw = max(atolw, atol) + # set seed + torch.random.manual_seed(0) + batch_size = 2 // 2 + dim = 768 // 8 + dstate = 8 // 8 + dt_rank = 48 // 8 + is_complex = wtype == torch.complex64 + xz = torch.randn(batch_size, 2 * dim, seqlen, device=device, dtype=itype, requires_grad=True) + conv1d_weight = torch.randn(dim, 1, 3, device=device, dtype=torch.float32, requires_grad=True) + conv1d_bias = torch.randn(dim, device=device, dtype=torch.float32, requires_grad=True) + x_proj_weight = torch.randn(dt_rank + (bool(is_variable_B) + bool(is_variable_C)) * dstate + * (1 if not is_complex else 2), + dim, device=device, dtype=itype, requires_grad=True) + delta_proj_weight = torch.randn(dim, dt_rank, device=device, dtype=itype, requires_grad=True) + out_proj_weight = torch.randn(dim // 2, dim, device=device, dtype=itype, requires_grad=True) + out_proj_bias = None + A = (-0.5 * torch.rand(dim, dstate, device=device, dtype=wtype)).requires_grad_() + A_b = (-0.5 * torch.rand(dim, dstate, device=device, dtype=wtype)).requires_grad_() + B = (torch.randn(dim, dstate, device=device, dtype=wtype, requires_grad=True) + if not is_variable_B else None) + C = (torch.randn(dim, dstate, device=device, dtype=wtype, requires_grad=True) + if not is_variable_C else None) + D = torch.randn(dim, device=device, dtype=torch.float32, requires_grad=True) + delta_bias = (0.5 * torch.rand(dim, device=device, dtype=torch.float32)).requires_grad_() + B_proj_bias = None + C_proj_bias = None + xz_ref = xz.detach().clone().requires_grad_() + conv1d_weight_ref = conv1d_weight.detach().clone().requires_grad_() + conv1d_bias_ref = conv1d_bias.detach().clone().requires_grad_() + x_proj_weight_ref = x_proj_weight.detach().clone().requires_grad_() + delta_proj_weight_ref = delta_proj_weight.detach().clone().requires_grad_() + out_proj_weight_ref = out_proj_weight.detach().clone().requires_grad_() + out_proj_bias_ref = (out_proj_bias.detach().clone().requires_grad_() + if out_proj_bias is not None else None) + A_ref = A.detach().clone().requires_grad_() + A_b_ref = A_b.detach().clone().requires_grad_() + B_ref = B.detach().clone().requires_grad_() if B is not None else None + C_ref = C.detach().clone().requires_grad_() if C is not None else None + D_ref = D.detach().clone().requires_grad_() + delta_bias_ref = delta_bias.detach().clone().requires_grad_() if delta_bias is not None else None + + # func = bimamba_inner_fn + # func = mamba_inner_fn + func = mamba_inner_ref + + # gradok = gradcheck(func, (xz, conv1d_weight, conv1d_bias, x_proj_weight, delta_proj_weight,out_proj_weight, out_proj_bias, A, A_b, B, C, 
D, delta_bias, None, None, True)) + gradok = gradcheck(func, (xz, conv1d_weight, conv1d_bias, x_proj_weight, delta_proj_weight,out_proj_weight, out_proj_bias, A, B, C, D, delta_bias, None, None, True), eps=1e-6, atol=1e-4, nondet_tol=1.) + print(f'* {gradok} check_gradient_numerical bimamba_inner_fn') + + + +# test_bimamba_inner_fn(True, True, 128, torch.float32, torch.float32) +# test_mamba_inner_fn(True, True, 128, torch.float32, torch.float32) +test_bimamba_inner_fn_grad_check(True, True, 128, torch.float32, torch.float32) + +# input = (torch.randn(20,20,dtype=torch.double,requires_grad=True), torch.randn(30,20,dtype=torch.double,requires_grad=True)) +# test = gradcheck(torch.nn.functional.linear, input, eps=1e-6, atol=1e-4) +# print(test) \ No newline at end of file diff --git a/mamba/tests/ops/triton/test_selective_state_update.py b/mamba/tests/ops/triton/test_selective_state_update.py new file mode 100644 index 0000000000000000000000000000000000000000..70a8d79d9cad3e4d33897478caf178bd96d0ae5a --- /dev/null +++ b/mamba/tests/ops/triton/test_selective_state_update.py @@ -0,0 +1,49 @@ +# Copyright (C) 2023, Tri Dao. + +import math + +import torch +import torch.nn.functional as F +import pytest + +from einops import rearrange + +from mamba_ssm.ops.triton.selective_state_update import selective_state_update, selective_state_update_ref + + +@pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16]) +# @pytest.mark.parametrize('itype', [torch.float16]) +@pytest.mark.parametrize("has_z", [False, True]) +# @pytest.mark.parametrize('has_z', [True]) +@pytest.mark.parametrize("dstate", [16, 32, 64]) +# @pytest.mark.parametrize("dstate", [16]) +@pytest.mark.parametrize("dim", [2048, 2048 + 16, 4096]) +# @pytest.mark.parametrize("dim", [2048]) +def test_causal_conv1d_update(dim, dstate, has_z, itype): + device = "cuda" + rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (5e-3, 1e-2) + if itype == torch.bfloat16: + rtol, atol = 1e-2, 5e-2 + # set seed + torch.random.manual_seed(0) + batch_size = 2 + state = torch.randn(batch_size, dim, dstate, dtype=itype, device=device) + x = torch.randn(batch_size, dim, device=device, dtype=itype) + dt = torch.randn(batch_size, dim, device=device, dtype=itype) + dt_bias = torch.rand(dim, device=device) - 4.0 + A = -torch.rand(dim, dstate, device=device) - 1.0 + B = torch.randn(batch_size, dstate, device=device) + C = torch.randn(batch_size, dstate, device=device) + D = torch.randn(dim, device=device) + if has_z: + z = torch.randn_like(x) + else: + z = None + state_ref = state.detach().clone() + out = selective_state_update(state, x, dt, A, B, C, D=D, z=z, dt_bias=dt_bias, dt_softplus=True) + out_ref = selective_state_update_ref(state_ref, x, dt, A, B, C, D=D, z=z, dt_bias=dt_bias, dt_softplus=True) + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + assert torch.allclose(state, state_ref, rtol=rtol, atol=atol) + assert torch.allclose(out, out_ref, rtol=rtol, atol=atol) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..de86068fe4d98f7c56f838c88b943716890a4b29 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,34 @@ +apex==0.1 +av==11.0.0 +decord==0.6.0 +deepspeed==0.13.1 +einops==0.7.0 +ftfy==6.1.3 +fvcore==0.1.5.post20221221 +imageio==2.33.1 +lm_eval==0.4.1 +numpy==1.26.4 +omegaconf==2.3.0 +opencv_python==4.8.1.78 +packaging==24.0 +pandas==2.2.1 +Pillow==10.1.0 +pytest==8.1.1 
+PyYAML==6.0.1 +regex==2023.10.3 +Requests==2.31.0 +scipy==1.12.0 +setuptools==68.2.2 +skimage==0.0 +submitit==1.5.1 +tensorboardX==2.6.2.2 +tensorflow==2.16.1 +termcolor==2.4.0 +timm==0.4.12 +torch==2.1.1 +torchvision==0.16.1 +tqdm==4.66.1 +transformers==4.36.1 +wandb==0.16.2 +wheel==0.42.0 +xformers==0.0.24 diff --git a/transforms.py b/transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..2483fdf8569e25978b922774e84cc2244315fe61 --- /dev/null +++ b/transforms.py @@ -0,0 +1,443 @@ +import torchvision +import random +from PIL import Image, ImageOps +import numpy as np +import numbers +import math +import torch + + +class GroupRandomCrop(object): + def __init__(self, size): + if isinstance(size, numbers.Number): + self.size = (int(size), int(size)) + else: + self.size = size + + def __call__(self, img_group): + + w, h = img_group[0].size + th, tw = self.size + + out_images = list() + + x1 = random.randint(0, w - tw) + y1 = random.randint(0, h - th) + + for img in img_group: + assert(img.size[0] == w and img.size[1] == h) + if w == tw and h == th: + out_images.append(img) + else: + out_images.append(img.crop((x1, y1, x1 + tw, y1 + th))) + + return out_images + + +class MultiGroupRandomCrop(object): + def __init__(self, size, groups=1): + if isinstance(size, numbers.Number): + self.size = (int(size), int(size)) + else: + self.size = size + self.groups = groups + + def __call__(self, img_group): + + w, h = img_group[0].size + th, tw = self.size + + out_images = list() + + for i in range(self.groups): + x1 = random.randint(0, w - tw) + y1 = random.randint(0, h - th) + + for img in img_group: + assert(img.size[0] == w and img.size[1] == h) + if w == tw and h == th: + out_images.append(img) + else: + out_images.append(img.crop((x1, y1, x1 + tw, y1 + th))) + + return out_images + + +class GroupCenterCrop(object): + def __init__(self, size): + self.worker = torchvision.transforms.CenterCrop(size) + + def __call__(self, img_group): + return [self.worker(img) for img in img_group] + + +class GroupRandomHorizontalFlip(object): + """Randomly horizontally flips the given PIL.Image with a probability of 0.5 + """ + + def __init__(self, is_flow=False): + self.is_flow = is_flow + + def __call__(self, img_group, is_flow=False): + v = random.random() + if v < 0.5: + ret = [img.transpose(Image.FLIP_LEFT_RIGHT) for img in img_group] + if self.is_flow: + for i in range(0, len(ret), 2): + # invert flow pixel values when flipping + ret[i] = ImageOps.invert(ret[i]) + return ret + else: + return img_group + + +class GroupNormalize(object): + def __init__(self, mean, std): + self.mean = mean + self.std = std + + def __call__(self, tensor): + rep_mean = self.mean * (tensor.size()[0] // len(self.mean)) + rep_std = self.std * (tensor.size()[0] // len(self.std)) + + # TODO: make efficient + for t, m, s in zip(tensor, rep_mean, rep_std): + t.sub_(m).div_(s) + + return tensor + + +class GroupScale(object): + """ Rescales the input PIL.Image to the given 'size'. + 'size' will be the size of the smaller edge. 
+ For example, if height > width, then image will be + rescaled to (size * height / width, size) + size: size of the smaller edge + interpolation: Default: PIL.Image.BILINEAR + """ + + def __init__(self, size, interpolation=Image.BILINEAR): + self.worker = torchvision.transforms.Resize(size, interpolation) + + def __call__(self, img_group): + return [self.worker(img) for img in img_group] + + +class GroupOverSample(object): + def __init__(self, crop_size, scale_size=None, flip=True): + self.crop_size = crop_size if not isinstance( + crop_size, int) else (crop_size, crop_size) + + if scale_size is not None: + self.scale_worker = GroupScale(scale_size) + else: + self.scale_worker = None + self.flip = flip + + def __call__(self, img_group): + + if self.scale_worker is not None: + img_group = self.scale_worker(img_group) + + image_w, image_h = img_group[0].size + crop_w, crop_h = self.crop_size + + offsets = GroupMultiScaleCrop.fill_fix_offset( + False, image_w, image_h, crop_w, crop_h) + oversample_group = list() + for o_w, o_h in offsets: + normal_group = list() + flip_group = list() + for i, img in enumerate(img_group): + crop = img.crop((o_w, o_h, o_w + crop_w, o_h + crop_h)) + normal_group.append(crop) + flip_crop = crop.copy().transpose(Image.FLIP_LEFT_RIGHT) + + if img.mode == 'L' and i % 2 == 0: + flip_group.append(ImageOps.invert(flip_crop)) + else: + flip_group.append(flip_crop) + + oversample_group.extend(normal_group) + if self.flip: + oversample_group.extend(flip_group) + return oversample_group + + +class GroupFullResSample(object): + def __init__(self, crop_size, scale_size=None, flip=True): + self.crop_size = crop_size if not isinstance( + crop_size, int) else (crop_size, crop_size) + + if scale_size is not None: + self.scale_worker = GroupScale(scale_size) + else: + self.scale_worker = None + self.flip = flip + + def __call__(self, img_group): + + if self.scale_worker is not None: + img_group = self.scale_worker(img_group) + + image_w, image_h = img_group[0].size + crop_w, crop_h = self.crop_size + + w_step = (image_w - crop_w) // 4 + h_step = (image_h - crop_h) // 4 + + offsets = list() + offsets.append((0 * w_step, 2 * h_step)) # left + offsets.append((4 * w_step, 2 * h_step)) # right + offsets.append((2 * w_step, 2 * h_step)) # center + + oversample_group = list() + for o_w, o_h in offsets: + normal_group = list() + flip_group = list() + for i, img in enumerate(img_group): + crop = img.crop((o_w, o_h, o_w + crop_w, o_h + crop_h)) + normal_group.append(crop) + if self.flip: + flip_crop = crop.copy().transpose(Image.FLIP_LEFT_RIGHT) + + if img.mode == 'L' and i % 2 == 0: + flip_group.append(ImageOps.invert(flip_crop)) + else: + flip_group.append(flip_crop) + + oversample_group.extend(normal_group) + oversample_group.extend(flip_group) + return oversample_group + + +class GroupMultiScaleCrop(object): + + def __init__(self, input_size, scales=None, max_distort=1, + fix_crop=True, more_fix_crop=True): + self.scales = scales if scales is not None else [1, .875, .75, .66] + self.max_distort = max_distort + self.fix_crop = fix_crop + self.more_fix_crop = more_fix_crop + self.input_size = input_size if not isinstance(input_size, int) else [ + input_size, input_size] + self.interpolation = Image.BILINEAR + + def __call__(self, img_group): + + im_size = img_group[0].size + + crop_w, crop_h, offset_w, offset_h = self._sample_crop_size(im_size) + crop_img_group = [ + img.crop( + (offset_w, + offset_h, + offset_w + + crop_w, + offset_h + + crop_h)) for img in img_group] + ret_img_group = 
[img.resize((self.input_size[0], self.input_size[1]), self.interpolation) + for img in crop_img_group] + return ret_img_group + + def _sample_crop_size(self, im_size): + image_w, image_h = im_size[0], im_size[1] + + # find a crop size + base_size = min(image_w, image_h) + crop_sizes = [int(base_size * x) for x in self.scales] + crop_h = [ + self.input_size[1] if abs( + x - self.input_size[1]) < 3 else x for x in crop_sizes] + crop_w = [ + self.input_size[0] if abs( + x - self.input_size[0]) < 3 else x for x in crop_sizes] + + pairs = [] + for i, h in enumerate(crop_h): + for j, w in enumerate(crop_w): + if abs(i - j) <= self.max_distort: + pairs.append((w, h)) + + crop_pair = random.choice(pairs) + if not self.fix_crop: + w_offset = random.randint(0, image_w - crop_pair[0]) + h_offset = random.randint(0, image_h - crop_pair[1]) + else: + w_offset, h_offset = self._sample_fix_offset( + image_w, image_h, crop_pair[0], crop_pair[1]) + + return crop_pair[0], crop_pair[1], w_offset, h_offset + + def _sample_fix_offset(self, image_w, image_h, crop_w, crop_h): + offsets = self.fill_fix_offset( + self.more_fix_crop, image_w, image_h, crop_w, crop_h) + return random.choice(offsets) + + @staticmethod + def fill_fix_offset(more_fix_crop, image_w, image_h, crop_w, crop_h): + w_step = (image_w - crop_w) // 4 + h_step = (image_h - crop_h) // 4 + + ret = list() + ret.append((0, 0)) # upper left + ret.append((4 * w_step, 0)) # upper right + ret.append((0, 4 * h_step)) # lower left + ret.append((4 * w_step, 4 * h_step)) # lower right + ret.append((2 * w_step, 2 * h_step)) # center + + if more_fix_crop: + ret.append((0, 2 * h_step)) # center left + ret.append((4 * w_step, 2 * h_step)) # center right + ret.append((2 * w_step, 4 * h_step)) # lower center + ret.append((2 * w_step, 0 * h_step)) # upper center + + ret.append((1 * w_step, 1 * h_step)) # upper left quarter + ret.append((3 * w_step, 1 * h_step)) # upper right quarter + ret.append((1 * w_step, 3 * h_step)) # lower left quarter + ret.append((3 * w_step, 3 * h_step)) # lower righ quarter + + return ret + + +class GroupRandomSizedCrop(object): + """Random crop the given PIL.Image to a random size of (0.08 to 1.0) of the original size + and and a random aspect ratio of 3/4 to 4/3 of the original aspect ratio + This is popularly used to train the Inception networks + size: size of the smaller edge + interpolation: Default: PIL.Image.BILINEAR + """ + + def __init__(self, size, interpolation=Image.BILINEAR): + self.size = size + self.interpolation = interpolation + + def __call__(self, img_group): + for attempt in range(10): + area = img_group[0].size[0] * img_group[0].size[1] + target_area = random.uniform(0.08, 1.0) * area + aspect_ratio = random.uniform(3. / 4, 4. 
/ 3) + + w = int(round(math.sqrt(target_area * aspect_ratio))) + h = int(round(math.sqrt(target_area / aspect_ratio))) + + if random.random() < 0.5: + w, h = h, w + + if w <= img_group[0].size[0] and h <= img_group[0].size[1]: + x1 = random.randint(0, img_group[0].size[0] - w) + y1 = random.randint(0, img_group[0].size[1] - h) + found = True + break + else: + found = False + x1 = 0 + y1 = 0 + + if found: + out_group = list() + for img in img_group: + img = img.crop((x1, y1, x1 + w, y1 + h)) + assert(img.size == (w, h)) + out_group.append( + img.resize( + (self.size, self.size), self.interpolation)) + return out_group + else: + # Fallback + scale = GroupScale(self.size, interpolation=self.interpolation) + crop = GroupRandomCrop(self.size) + return crop(scale(img_group)) + + +class ConvertDataFormat(object): + def __init__(self, model_type): + self.model_type = model_type + + def __call__(self, images): + if self.model_type == '2D': + return images + tc, h, w = images.size() + t = tc // 3 + images = images.view(t, 3, h, w) + images = images.permute(1, 0, 2, 3) + return images + + +class Stack(object): + + def __init__(self, roll=False): + self.roll = roll + + def __call__(self, img_group): + if img_group[0].mode == 'L': + return np.concatenate([np.expand_dims(x, 2) + for x in img_group], axis=2) + elif img_group[0].mode == 'RGB': + if self.roll: + return np.concatenate([np.array(x)[:, :, ::-1] + for x in img_group], axis=2) + else: + #print(np.concatenate(img_group, axis=2).shape) + # print(img_group[0].shape) + return np.concatenate(img_group, axis=2) + + +class ToTorchFormatTensor(object): + """ Converts a PIL.Image (RGB) or numpy.ndarray (H x W x C) in the range [0, 255] + to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0] """ + + def __init__(self, div=True): + self.div = div + + def __call__(self, pic): + if isinstance(pic, np.ndarray): + # handle numpy array + img = torch.from_numpy(pic).permute(2, 0, 1).contiguous() + else: + # handle PIL Image + img = torch.ByteTensor( + torch.ByteStorage.from_buffer( + pic.tobytes())) + img = img.view(pic.size[1], pic.size[0], len(pic.mode)) + # put it from HWC to CHW format + # yikes, this transpose takes 80% of the loading time/CPU + img = img.transpose(0, 1).transpose(0, 2).contiguous() + return img.float().div(255) if self.div else img.float() + + +class IdentityTransform(object): + + def __call__(self, data): + return data + + +if __name__ == "__main__": + trans = torchvision.transforms.Compose([ + GroupScale(256), + GroupRandomCrop(224), + Stack(), + ToTorchFormatTensor(), + GroupNormalize( + mean=[.485, .456, .406], + std=[.229, .224, .225] + )] + ) + + im = Image.open('../tensorflow-model-zoo.torch/lena_299.png') + + color_group = [im] * 3 + rst = trans(color_group) + + gray_group = [im.convert('L')] * 9 + gray_rst = trans(gray_group) + + trans2 = torchvision.transforms.Compose([ + GroupRandomSizedCrop(256), + Stack(), + ToTorchFormatTensor(), + GroupNormalize( + mean=[.485, .456, .406], + std=[.229, .224, .225]) + ]) + print(trans2(color_group)) diff --git a/videomamba_image.py b/videomamba_image.py new file mode 100644 index 0000000000000000000000000000000000000000..a7c06537a317568fdb491f7b947f79cdc73cb090 --- /dev/null +++ b/videomamba_image.py @@ -0,0 +1,363 @@ +# Copyright (c) 2015-present, Facebook, Inc. +# All rights reserved. 
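
The group transforms in transforms.py above operate on a list of PIL frames rather than a single image: `Stack` concatenates the frames along the channel axis and `ToTorchFormatTensor` converts the result to a float tensor in [0, 1], so an RGB clip of T frames comes out as one (T*3, H, W) tensor that `GroupNormalize` then normalizes channel-wise. A minimal usage sketch, assuming transforms.py from this repo is importable and using synthetic frames as placeholders for decoded video:

```python
# Minimal usage sketch for the group transforms defined above.
# The random frames are synthetic placeholders; a real pipeline would decode video frames.
import numpy as np
from PIL import Image
import torchvision

from transforms import (
    GroupScale, GroupCenterCrop, Stack, ToTorchFormatTensor, GroupNormalize
)

frames = [Image.fromarray(np.random.randint(0, 255, (256, 320, 3), dtype=np.uint8))
          for _ in range(16)]  # 16 RGB frames

transform = torchvision.transforms.Compose([
    GroupScale(224),            # resize the shorter edge of every frame
    GroupCenterCrop(224),       # same center crop for the whole group
    Stack(),                    # concatenate frames along channels -> 224 x 224 x (16*3)
    ToTorchFormatTensor(),      # HWC uint8 -> CHW float in [0, 1]
    GroupNormalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

clip = transform(frames)
print(clip.shape)  # torch.Size([48, 224, 224]) == (16 frames * 3 channels, H, W)
```

`ConvertDataFormat` above performs the corresponding (T*3, H, W) -> (3, T, H, W) reshape when a 3D model is selected, which is the channels-first, time-second layout the video backbone expects.
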
+import torch +import torch.nn as nn +from functools import partial +from torch import Tensor +from typing import Optional + +from timm.models.vision_transformer import _cfg +from timm.models.layers import trunc_normal_ + +from timm.models.layers import DropPath, to_2tuple +from timm.models.vision_transformer import _load_weights + +import math + +from mamba_ssm.modules.mamba_simple import Mamba + +try: + from mamba_ssm.ops.triton.layernorm import RMSNorm, layer_norm_fn, rms_norm_fn +except ImportError: + RMSNorm, layer_norm_fn, rms_norm_fn = None, None, None + + +class PatchEmbed(nn.Module): + """ 2D Image to Patch Embedding + """ + def __init__(self, img_size=224, patch_size=16, stride=16, in_chans=3, embed_dim=768, norm_layer=None, flatten=True): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + self.img_size = img_size + self.patch_size = patch_size + self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1]) + self.grid_size = ((img_size[0] - patch_size[0]) // stride + 1, (img_size[1] - patch_size[1]) // stride + 1) + self.num_patches = self.grid_size[0] * self.grid_size[1] + self.flatten = flatten + + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=stride) + self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() + + def forward(self, x): + B, C, H, W = x.shape + assert H == self.img_size[0] and W == self.img_size[1], \ + f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." + x = self.proj(x) + if self.flatten: + x = x.flatten(2).transpose(1, 2) # BCHW -> BNC + x = self.norm(x) + return x + + +class Block(nn.Module): + def __init__( + self, dim, mixer_cls, norm_cls=nn.LayerNorm, fused_add_norm=False, residual_in_fp32=False,drop_path=0., + ): + """ + Simple block wrapping a mixer class with LayerNorm/RMSNorm and residual connection" + + This Block has a slightly different structure compared to a regular + prenorm Transformer block. + The standard block is: LN -> MHA/MLP -> Add. + [Ref: https://arxiv.org/abs/2002.04745] + Here we have: Add -> LN -> Mixer, returning both + the hidden_states (output of the mixer) and the residual. + This is purely for performance reasons, as we can fuse add and LayerNorm. + The residual needs to be provided (except for the very first block). + """ + super().__init__() + self.residual_in_fp32 = residual_in_fp32 + self.fused_add_norm = fused_add_norm + self.mixer = mixer_cls(dim) + self.norm = norm_cls(dim) + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + if self.fused_add_norm: + assert RMSNorm is not None, "RMSNorm import fails" + assert isinstance( + self.norm, (nn.LayerNorm, RMSNorm) + ), "Only LayerNorm and RMSNorm are supported for fused_add_norm" + + def forward( + self, hidden_states: Tensor, residual: Optional[Tensor] = None, inference_params=None + ): + r"""Pass the input through the encoder layer. + + Args: + hidden_states: the sequence to the encoder layer (required). 
+ residual: hidden_states = Mixer(LN(residual)) + """ + if not self.fused_add_norm: + residual = (residual + self.drop_path(hidden_states)) if residual is not None else hidden_states + hidden_states = self.norm(residual.to(dtype=self.norm.weight.dtype)) + if self.residual_in_fp32: + residual = residual.to(torch.float32) + else: + fused_add_norm_fn = rms_norm_fn if isinstance(self.norm, RMSNorm) else layer_norm_fn + hidden_states, residual = fused_add_norm_fn( + hidden_states if residual is None else self.drop_path(hidden_states), + self.norm.weight, + self.norm.bias, + residual=residual, + prenorm=True, + residual_in_fp32=self.residual_in_fp32, + eps=self.norm.eps, + ) + hidden_states = self.mixer(hidden_states, inference_params=inference_params) + return hidden_states, residual + + def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs): + return self.mixer.allocate_inference_cache(batch_size, max_seqlen, dtype=dtype, **kwargs) + + +def create_block( + d_model, + ssm_cfg=None, + norm_epsilon=1e-5, + drop_path=0., + rms_norm=True, + residual_in_fp32=True, + fused_add_norm=True, + layer_idx=None, + bimamba=True, + device=None, + dtype=None, +): + factory_kwargs = {"device": device, "dtype": dtype} + if ssm_cfg is None: + ssm_cfg = {} + mixer_cls = partial(Mamba, layer_idx=layer_idx, bimamba=bimamba, **ssm_cfg, **factory_kwargs) + norm_cls = partial(nn.LayerNorm if not rms_norm else RMSNorm, eps=norm_epsilon) + block = Block( + d_model, + mixer_cls, + norm_cls=norm_cls, + drop_path=drop_path, + fused_add_norm=fused_add_norm, + residual_in_fp32=residual_in_fp32, + ) + block.layer_idx = layer_idx + return block + + +# https://github.com/huggingface/transformers/blob/c28d04e9e252a1a099944e325685f14d242ecdcd/src/transformers/models/gpt2/modeling_gpt2.py#L454 +def _init_weights( + module, + n_layer, + initializer_range=0.02, # Now only used for embedding layer. + rescale_prenorm_residual=True, + n_residuals_per_layer=1, # Change to 2 if we have MLP +): + if isinstance(module, nn.Linear): + if module.bias is not None: + if not getattr(module.bias, "_no_reinit", False): + nn.init.zeros_(module.bias) + elif isinstance(module, nn.Embedding): + nn.init.normal_(module.weight, std=initializer_range) + + if rescale_prenorm_residual: + # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme: + # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale + # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers. 
+ # > -- GPT-2 :: https://openai.com/blog/better-language-models/ + # + # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py + for name, p in module.named_parameters(): + if name in ["out_proj.weight", "fc2.weight"]: + # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block + # Following Pytorch init, except scale by 1/sqrt(2 * n_layer) + # We need to reinit p since this code could be called multiple times + # Having just p *= scale would repeatedly scale it down + nn.init.kaiming_uniform_(p, a=math.sqrt(5)) + with torch.no_grad(): + p /= math.sqrt(n_residuals_per_layer * n_layer) + + +def segm_init_weights(m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=0.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + +class VisionMamba(nn.Module): + def __init__( + self, + img_size=224, + patch_size=16, + stride=16, + depth=24, + embed_dim=192, + channels=3, + num_classes=1000, + drop_rate=0., + drop_path_rate=0.1, + ssm_cfg=None, + norm_epsilon=1e-5, + initializer_cfg=None, + fused_add_norm=True, + rms_norm=True, + residual_in_fp32=True, + bimamba=True, + device=None, + dtype=None, + ): + factory_kwargs = {"device": device, "dtype": dtype} # follow MambaLMHeadModel + super().__init__() + self.residual_in_fp32 = residual_in_fp32 + self.fused_add_norm = fused_add_norm + + # pretrain parameters + self.num_classes = num_classes + self.d_model = self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + + self.patch_embed = PatchEmbed( + img_size=img_size, patch_size=patch_size, stride=stride, in_chans=channels, embed_dim=embed_dim) + num_patches = self.patch_embed.num_patches + + self.cls_token = nn.Parameter(torch.zeros(1, 1, self.embed_dim)) + self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, self.embed_dim)) + self.pos_drop = nn.Dropout(p=drop_rate) + + self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity() + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule + inter_dpr = [0.0] + dpr + self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. 
else nn.Identity() + # mamba blocks + self.layers = nn.ModuleList( + [ + create_block( + embed_dim, + ssm_cfg=ssm_cfg, + norm_epsilon=norm_epsilon, + rms_norm=rms_norm, + residual_in_fp32=residual_in_fp32, + fused_add_norm=fused_add_norm, + layer_idx=i, + bimamba=bimamba, + drop_path=inter_dpr[i], + **factory_kwargs, + ) + for i in range(depth) + ] + ) + + # output head + self.norm_f = (nn.LayerNorm if not rms_norm else RMSNorm)(embed_dim, eps=norm_epsilon, **factory_kwargs) + + # original init + self.apply(segm_init_weights) + self.head.apply(segm_init_weights) + trunc_normal_(self.pos_embed, std=.02) + + # mamba init + self.apply( + partial( + _init_weights, + n_layer=depth, + **(initializer_cfg if initializer_cfg is not None else {}), + ) + ) + + def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs): + return { + i: layer.allocate_inference_cache(batch_size, max_seqlen, dtype=dtype, **kwargs) + for i, layer in enumerate(self.layers) + } + + @torch.jit.ignore + def no_weight_decay(self): + return {"pos_embed", "cls_token"} + + @torch.jit.ignore() + def load_pretrained(self, checkpoint_path, prefix=""): + _load_weights(self, checkpoint_path, prefix) + + def forward_features(self, x, inference_params=None): + x = self.patch_embed(x) + cls_token = self.cls_token.expand(x.shape[0], -1, -1) # stole cls_tokens impl from Phil Wang, thanks + x = torch.cat((cls_token, x), dim=1) + + x = x + self.pos_embed + x = self.pos_drop(x) + + # mamba impl + residual = None + hidden_states = x + for layer in self.layers: + hidden_states, residual = layer( + hidden_states, residual, inference_params=inference_params + ) + + if not self.fused_add_norm: + if residual is None: + residual = hidden_states + else: + residual = residual + self.drop_path(hidden_states) + hidden_states = self.norm_f(residual.to(dtype=self.norm_f.weight.dtype)) + else: + # Set prenorm=False here since we don't need the residual + fused_add_norm_fn = rms_norm_fn if isinstance(self.norm_f, RMSNorm) else layer_norm_fn + hidden_states = fused_add_norm_fn( + self.drop_path(hidden_states), + self.norm_f.weight, + self.norm_f.bias, + eps=self.norm_f.eps, + residual=residual, + prenorm=False, + residual_in_fp32=self.residual_in_fp32, + ) + + # return only cls token + return hidden_states[:, 0, :] + + def forward(self, x, inference_params=None): + x = self.forward_features(x, inference_params) + x = self.head(x) + return x + + +def videomamba_image_tiny(**kwargs): + model = VisionMamba( + patch_size=16, + embed_dim=192, + depth=24, + rms_norm=True, + residual_in_fp32=True, + fused_add_norm=True, + **kwargs + ) + model.default_cfg = _cfg() + return model + + +def videomamba_image_small(**kwargs): + model = VisionMamba( + patch_size=16, + embed_dim=384, + depth=24, + rms_norm=True, + residual_in_fp32=True, + fused_add_norm=True, + **kwargs + ) + model.default_cfg = _cfg() + return model + + +def videomamba_image_middle(**kwargs): + model = VisionMamba( + patch_size=16, + embed_dim=576, + depth=32, + rms_norm=True, + residual_in_fp32=True, + fused_add_norm=True, + **kwargs + ) + model.default_cfg = _cfg() + return model diff --git a/videomamba_video.py b/videomamba_video.py new file mode 100644 index 0000000000000000000000000000000000000000..a76561e60de92eab1cde7a18f42be8c19d6acc0b --- /dev/null +++ b/videomamba_video.py @@ -0,0 +1,431 @@ +# Copyright (c) 2015-present, Facebook, Inc. +# All rights reserved. 
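
videomamba_image.py above assembles a ViT-style classifier out of Mamba blocks: 16x16 patches plus a cls token and learned positional embeddings pass through the bidirectional Mamba blocks (24 of them in the tiny variant), and only the final cls token is fed to the linear head. A forward-pass sketch with untrained weights, assuming `mamba_ssm` is installed and a CUDA device is available (the selective-scan kernels are CUDA extensions):

```python
# Forward-pass sketch for the tiny image model defined above (random init, no checkpoint).
# Assumes mamba_ssm is installed and a CUDA device is present.
import torch
from videomamba_image import videomamba_image_tiny

model = videomamba_image_tiny().to("cuda").eval()  # embed_dim=192, depth=24, 1000-way head by default

x = torch.randn(2, 3, 224, 224, device="cuda")     # PatchEmbed asserts a 224x224 input
with torch.no_grad():
    logits = model(x)

print(logits.shape)  # torch.Size([2, 1000])
```

The video variant in videomamba_video.py below follows the same structure but swaps in a Conv3d patch embedding and adds a separate temporal positional embedding for the frame dimension.
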
+import os +import torch +import torch.nn as nn +from functools import partial +from torch import Tensor +from typing import Optional +import torch.utils.checkpoint as checkpoint + +from einops import rearrange +from timm.models.vision_transformer import _cfg +from timm.models.layers import trunc_normal_ + +from timm.models.layers import DropPath, to_2tuple +from timm.models.vision_transformer import _load_weights + +import math + +from mamba_ssm.modules.mamba_simple import Mamba + +try: + from mamba_ssm.ops.triton.layernorm import RMSNorm, layer_norm_fn, rms_norm_fn +except ImportError: + RMSNorm, layer_norm_fn, rms_norm_fn = None, None, None + + +class Block(nn.Module): + def __init__( + self, dim, mixer_cls, norm_cls=nn.LayerNorm, fused_add_norm=False, residual_in_fp32=False,drop_path=0., + ): + """ + Simple block wrapping a mixer class with LayerNorm/RMSNorm and residual connection" + + This Block has a slightly different structure compared to a regular + prenorm Transformer block. + The standard block is: LN -> MHA/MLP -> Add. + [Ref: https://arxiv.org/abs/2002.04745] + Here we have: Add -> LN -> Mixer, returning both + the hidden_states (output of the mixer) and the residual. + This is purely for performance reasons, as we can fuse add and LayerNorm. + The residual needs to be provided (except for the very first block). + """ + super().__init__() + self.residual_in_fp32 = residual_in_fp32 + self.fused_add_norm = fused_add_norm + self.mixer = mixer_cls(dim) + self.norm = norm_cls(dim) + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + if self.fused_add_norm: + assert RMSNorm is not None, "RMSNorm import fails" + assert isinstance( + self.norm, (nn.LayerNorm, RMSNorm) + ), "Only LayerNorm and RMSNorm are supported for fused_add_norm" + + def forward( + self, hidden_states: Tensor, residual: Optional[Tensor] = None, inference_params=None, + use_checkpoint=False + ): + r"""Pass the input through the encoder layer. + + Args: + hidden_states: the sequence to the encoder layer (required). 
+ residual: hidden_states = Mixer(LN(residual)) + """ + if not self.fused_add_norm: + residual = (residual + self.drop_path(hidden_states)) if residual is not None else hidden_states + hidden_states = self.norm(residual.to(dtype=self.norm.weight.dtype)) + if self.residual_in_fp32: + residual = residual.to(torch.float32) + else: + fused_add_norm_fn = rms_norm_fn if isinstance(self.norm, RMSNorm) else layer_norm_fn + hidden_states, residual = fused_add_norm_fn( + hidden_states if residual is None else self.drop_path(hidden_states), + self.norm.weight, + self.norm.bias, + residual=residual, + prenorm=True, + residual_in_fp32=self.residual_in_fp32, + eps=self.norm.eps, + ) + if use_checkpoint: + hidden_states = checkpoint.checkpoint(self.mixer, hidden_states, inference_params) + else: + hidden_states = self.mixer(hidden_states, inference_params=inference_params) + return hidden_states, residual + + def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs): + return self.mixer.allocate_inference_cache(batch_size, max_seqlen, dtype=dtype, **kwargs) + + +def create_block( + d_model, + ssm_cfg=None, + norm_epsilon=1e-5, + drop_path=0., + rms_norm=True, + residual_in_fp32=True, + fused_add_norm=True, + layer_idx=None, + bimamba=True, + device=None, + dtype=None, +): + factory_kwargs = {"device": device, "dtype": dtype} + if ssm_cfg is None: + ssm_cfg = {} + mixer_cls = partial(Mamba, layer_idx=layer_idx, bimamba=bimamba, **ssm_cfg, **factory_kwargs) + norm_cls = partial(nn.LayerNorm if not rms_norm else RMSNorm, eps=norm_epsilon) + block = Block( + d_model, + mixer_cls, + norm_cls=norm_cls, + drop_path=drop_path, + fused_add_norm=fused_add_norm, + residual_in_fp32=residual_in_fp32, + ) + block.layer_idx = layer_idx + return block + + +# https://github.com/huggingface/transformers/blob/c28d04e9e252a1a099944e325685f14d242ecdcd/src/transformers/models/gpt2/modeling_gpt2.py#L454 +def _init_weights( + module, + n_layer, + initializer_range=0.02, # Now only used for embedding layer. + rescale_prenorm_residual=True, + n_residuals_per_layer=1, # Change to 2 if we have MLP +): + if isinstance(module, nn.Linear): + if module.bias is not None: + if not getattr(module.bias, "_no_reinit", False): + nn.init.zeros_(module.bias) + elif isinstance(module, nn.Embedding): + nn.init.normal_(module.weight, std=initializer_range) + + if rescale_prenorm_residual: + # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme: + # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale + # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers. 
+ # > -- GPT-2 :: https://openai.com/blog/better-language-models/ + # + # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py + for name, p in module.named_parameters(): + if name in ["out_proj.weight", "fc2.weight"]: + # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block + # Following Pytorch init, except scale by 1/sqrt(2 * n_layer) + # We need to reinit p since this code could be called multiple times + # Having just p *= scale would repeatedly scale it down + nn.init.kaiming_uniform_(p, a=math.sqrt(5)) + with torch.no_grad(): + p /= math.sqrt(n_residuals_per_layer * n_layer) + + +def segm_init_weights(m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=0.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + +class PatchEmbed(nn.Module): + """ Image to Patch Embedding + """ + def __init__(self, img_size=224, patch_size=16, kernel_size=1, in_chans=3, embed_dim=768): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + self.tubelet_size = kernel_size + + self.proj = nn.Conv3d( + in_chans, embed_dim, + kernel_size=(kernel_size, patch_size[0], patch_size[1]), + stride=(kernel_size, patch_size[0], patch_size[1]) + ) + + def forward(self, x): + x = self.proj(x) + return x + + +class VisionMamba(nn.Module): + def __init__( + self, + img_size=224, + patch_size=16, + depth=24, + embed_dim=192, + channels=3, + num_classes=1000, + drop_rate=0., + drop_path_rate=0.1, + ssm_cfg=None, + norm_epsilon=1e-5, + initializer_cfg=None, + fused_add_norm=True, + rms_norm=True, + residual_in_fp32=True, + bimamba=True, + # video + kernel_size=1, + num_frames=8, + fc_drop_rate=0., + device=None, + dtype=None, + # checkpoint + use_checkpoint=False, + checkpoint_num=0, + ): + factory_kwargs = {"device": device, "dtype": dtype} # follow MambaLMHeadModel + super().__init__() + self.residual_in_fp32 = residual_in_fp32 + self.fused_add_norm = fused_add_norm + self.use_checkpoint = use_checkpoint + self.checkpoint_num = checkpoint_num + print(f'Use checkpoint: {use_checkpoint}') + print(f'Checkpoint number: {checkpoint_num}') + + # pretrain parameters + self.num_classes = num_classes + self.d_model = self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + + self.patch_embed = PatchEmbed( + img_size=img_size, patch_size=patch_size, + kernel_size=kernel_size, + in_chans=channels, embed_dim=embed_dim + ) + num_patches = self.patch_embed.num_patches + + self.cls_token = nn.Parameter(torch.zeros(1, 1, self.embed_dim)) + self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, self.embed_dim)) + self.temporal_pos_embedding = nn.Parameter(torch.zeros(1, num_frames // kernel_size, embed_dim)) + self.pos_drop = nn.Dropout(p=drop_rate) + + self.head_drop = nn.Dropout(fc_drop_rate) if fc_drop_rate > 0 else nn.Identity() + self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity() + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule + inter_dpr = [0.0] + dpr + self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. 
else nn.Identity() + # mamba blocks + self.layers = nn.ModuleList( + [ + create_block( + embed_dim, + ssm_cfg=ssm_cfg, + norm_epsilon=norm_epsilon, + rms_norm=rms_norm, + residual_in_fp32=residual_in_fp32, + fused_add_norm=fused_add_norm, + layer_idx=i, + bimamba=bimamba, + drop_path=inter_dpr[i], + **factory_kwargs, + ) + for i in range(depth) + ] + ) + + # output head + self.norm_f = (nn.LayerNorm if not rms_norm else RMSNorm)(embed_dim, eps=norm_epsilon, **factory_kwargs) + + # original init + self.apply(segm_init_weights) + self.head.apply(segm_init_weights) + trunc_normal_(self.pos_embed, std=.02) + + # mamba init + self.apply( + partial( + _init_weights, + n_layer=depth, + **(initializer_cfg if initializer_cfg is not None else {}), + ) + ) + + def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs): + return { + i: layer.allocate_inference_cache(batch_size, max_seqlen, dtype=dtype, **kwargs) + for i, layer in enumerate(self.layers) + } + + @torch.jit.ignore + def no_weight_decay(self): + return {"pos_embed", "cls_token", "temporal_pos_embedding"} + + def get_num_layers(self): + return len(self.layers) + + @torch.jit.ignore() + def load_pretrained(self, checkpoint_path, prefix=""): + _load_weights(self, checkpoint_path, prefix) + + def forward_features(self, x, inference_params=None): + x = self.patch_embed(x) + B, C, T, H, W = x.shape + x = x.permute(0, 2, 3, 4, 1).reshape(B * T, H * W, C) + + cls_token = self.cls_token.expand(x.shape[0], -1, -1) # stole cls_tokens impl from Phil Wang, thanks + x = torch.cat((cls_token, x), dim=1) + x = x + self.pos_embed + + # temporal pos + cls_tokens = x[:B, :1, :] + x = x[:, 1:] + x = rearrange(x, '(b t) n m -> (b n) t m', b=B, t=T) + x = x + self.temporal_pos_embedding + x = rearrange(x, '(b n) t m -> b (t n) m', b=B, t=T) + x = torch.cat((cls_tokens, x), dim=1) + + x = self.pos_drop(x) + + # mamba impl + residual = None + hidden_states = x + for idx, layer in enumerate(self.layers): + if self.use_checkpoint and idx < self.checkpoint_num: + hidden_states, residual = layer( + hidden_states, residual, inference_params=inference_params, + use_checkpoint=True + ) + else: + hidden_states, residual = layer( + hidden_states, residual, inference_params=inference_params + ) + + if not self.fused_add_norm: + if residual is None: + residual = hidden_states + else: + residual = residual + self.drop_path(hidden_states) + hidden_states = self.norm_f(residual.to(dtype=self.norm_f.weight.dtype)) + else: + # Set prenorm=False here since we don't need the residual + fused_add_norm_fn = rms_norm_fn if isinstance(self.norm_f, RMSNorm) else layer_norm_fn + hidden_states = fused_add_norm_fn( + self.drop_path(hidden_states), + self.norm_f.weight, + self.norm_f.bias, + eps=self.norm_f.eps, + residual=residual, + prenorm=False, + residual_in_fp32=self.residual_in_fp32, + ) + + # return only cls token + return hidden_states[:, 0, :] + + def forward(self, x, inference_params=None): + x = self.forward_features(x, inference_params) + x = self.head(self.head_drop(x)) + return x + + +def inflate_weight(weight_2d, time_dim, center=True): + print(f'Init center: {center}') + if center: + weight_3d = torch.zeros(*weight_2d.shape) + weight_3d = weight_3d.unsqueeze(2).repeat(1, 1, time_dim, 1, 1) + middle_idx = time_dim // 2 + weight_3d[:, :, middle_idx, :, :] = weight_2d + else: + weight_3d = weight_2d.unsqueeze(2).repeat(1, 1, time_dim, 1, 1) + weight_3d = weight_3d / time_dim + return weight_3d + + +def load_state_dict(model, state_dict, center=True): + 
state_dict_3d = model.state_dict() + for k in state_dict.keys(): + if k in state_dict_3d.keys() and state_dict[k].shape != state_dict_3d[k].shape: + if len(state_dict_3d[k].shape) <= 3: + print(f'Ignore: {k}') + continue + print(f'Inflate: {k}, {state_dict[k].shape} => {state_dict_3d[k].shape}') + time_dim = state_dict_3d[k].shape[2] + state_dict[k] = inflate_weight(state_dict[k], time_dim, center=center) + + del state_dict['head.weight'] + del state_dict['head.bias'] + msg = model.load_state_dict(state_dict, strict=False) + print(msg) + + +def videomamba_tiny(**kwargs): + model = VisionMamba( + patch_size=16, + embed_dim=192, + depth=24, + rms_norm=True, + residual_in_fp32=True, + fused_add_norm=True, + **kwargs + ) + model.default_cfg = _cfg() + return model + + +def videomamba_small(**kwargs): + model = VisionMamba( + patch_size=16, + embed_dim=384, + depth=24, + rms_norm=True, + residual_in_fp32=True, + fused_add_norm=True, + **kwargs + ) + model.default_cfg = _cfg() + return model + + +def videomamba_middle(**kwargs): + model = VisionMamba( + patch_size=16, + embed_dim=576, + depth=32, + rms_norm=True, + residual_in_fp32=True, + fused_add_norm=True, + **kwargs + ) + return model + diff --git a/videos/hitting_baseball.mp4 b/videos/hitting_baseball.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..acfdfcba8c7ee9aca9b73c6a3296efd29c2c9c3f Binary files /dev/null and b/videos/hitting_baseball.mp4 differ diff --git a/videos/hoverboarding.mp4 b/videos/hoverboarding.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..3bff7eb496f7f69976a88fb06def1a8bce3f38c4 Binary files /dev/null and b/videos/hoverboarding.mp4 differ diff --git a/videos/yoga.mp4 b/videos/yoga.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..7ad10e5d0162c2117da0aa36fb1e79483967d646 Binary files /dev/null and b/videos/yoga.mp4 differ
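
A minimal usage sketch follows, for illustration only (it is not part of the patch above). It assumes the file added above is importable as videomamba_video, that mamba_ssm is installed with its CUDA kernels (the Mamba mixer has no CPU fallback), and that the checkpoint path in the commented-out lines is a placeholder rather than a file shipped with this repo.

# Usage sketch: build the tiny video model defined above and classify a dummy clip.
import torch
from videomamba_video import videomamba_tiny, load_state_dict

device = "cuda"  # mamba_ssm's selective-scan kernels require a CUDA device
model = videomamba_tiny(num_classes=400, num_frames=16).to(device).eval()

# Optionally start from a 2D checkpoint: load_state_dict() inflates any
# mismatched 2D weights along the temporal axis (via inflate_weight) and
# drops the classification head before loading with strict=False.
# The path below is a placeholder, not a file provided by this Space.
# sd = torch.load("/path/to/image_pretrained.pth", map_location="cpu")
# load_state_dict(model, sd, center=True)

# Dummy clip: batch x channels x frames x height x width.
clip = torch.randn(1, 3, 16, 224, 224, device=device)
with torch.no_grad():
    logits = model(clip)                  # shape: (1, 400)
    top5 = logits.softmax(dim=-1).topk(5)
print(top5.indices.tolist())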